Sean Pedrick-Case committed on
Commit 10da194 · unverified · 2 Parent(s): 95ca426 3946be6

Merge pull request #49 from seanpedrick-case/dev_new


Revamped duplicate page/subdocument removal, CDK code, updated documentation, read-only file system compatibility.

.dockerignore CHANGED
@@ -4,10 +4,9 @@
  *.jpg
  *.png
  *.ipynb
+ *.pyc
  examples/*
  processing/*
- input/*
- output/*
  tools/__pycache__/*
  old_code/*
  tesseract/*
@@ -15,9 +14,18 @@ poppler/*
  build/*
  dist/*
  build_deps/*
- logs/*
- config/*
  user_guide/*
- cdk/*
  cdk/config/*
- web/*
+ tld/*
+ cdk/config/*
+ cdk/cdk.out/*
+ cdk/archive/*
+ cdk.json
+ cdk.context.json
+ .quarto/*
+ logs/
+ output/
+ input/
+ feedback/
+ config/
+ usage/
.gitignore CHANGED
@@ -4,6 +4,7 @@
  *.jpg
  *.png
  *.ipynb
+ *.pyc
  examples/*
  processing/*
  input/*
@@ -19,6 +20,14 @@ logs/*
  config/*
  doc_redaction_amplify_app/*
  user_guide/*
- cdk/*
  cdk/config/*
- web/*
+ cdk/cdk.out/*
+ cdk/archive/*
+ tld/*
+ tmp/*
+ cdk.out/*
+ cdk.json
+ cdk.context.json
+ .quarto/*
+ /.quarto/
+ /_site/
Dockerfile CHANGED
@@ -1,14 +1,14 @@
  # Stage 1: Build dependencies and download models
  FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm AS builder
 
- # Install system dependencies. Need to specify -y for poppler to get it to install
+ # Install system dependencies
  RUN apt-get update \
  && apt-get install -y \
  g++ \
  make \
  cmake \
  unzip \
- libcurl4-openssl-dev \
+ libcurl4-openssl-dev \
  git \
  && apt-get clean \
  && rm -rf /var/lib/apt/lists/*
@@ -17,28 +17,20 @@ WORKDIR /src
 
  COPY requirements.txt .
 
- RUN pip install --no-cache-dir --target=/install -r requirements.txt
-
- RUN rm requirements.txt
+ RUN pip install --no-cache-dir --target=/install -r requirements.txt && rm requirements.txt
 
- # Add lambda_entrypoint.py to the container
+ # Add lambda entrypoint and script
  COPY lambda_entrypoint.py .
-
  COPY entrypoint.sh .
 
  # Stage 2: Final runtime image
  FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
 
- # Define a build argument with a default value
+ # Set build-time and runtime environment variable
  ARG APP_MODE=gradio
-
- # Echo the APP_MODE during the build to confirm its value
- RUN echo "APP_MODE is set to: ${APP_MODE}"
-
- # Set APP_MODE as an environment variable for runtime
  ENV APP_MODE=${APP_MODE}
 
- # Install system dependencies
+ # Install runtime dependencies
  RUN apt-get update \
  && apt-get install -y \
  tesseract-ocr \
@@ -48,30 +40,85 @@ RUN apt-get update \
  && apt-get clean \
  && rm -rf /var/lib/apt/lists/*
 
- # Set up a new user named "user" with user ID 1000
+ # Create non-root user
  RUN useradd -m -u 1000 user
+ ENV APP_HOME=/home/user
 
- # Create required directories
- RUN mkdir -p /home/user/app/{output,input,tld,logs,usage,feedback,config} \
- && chown -R user:user /home/user/app
+ # Set env variables for Gradio & other apps
+ ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
+ TLDEXTRACT_CACHE=/tmp/tld/ \
+ MPLCONFIGDIR=/tmp/matplotlib_cache/ \
+ GRADIO_OUTPUT_FOLDER=$APP_HOME/app/output/ \
+ GRADIO_INPUT_FOLDER=$APP_HOME/app/input/ \
+ FEEDBACK_LOGS_FOLDER=$APP_HOME/app/feedback/ \
+ ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
+ USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
+ CONFIG_FOLDER=$APP_HOME/app/config/ \
+ XDG_CACHE_HOME=/tmp/xdg_cache/user_1000
+
+ # Create the base application directory and set its ownership
+ RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
+
+ # Create required sub-folders within the app directory and set their permissions
+ # This ensures these specific directories are owned by 'user'
+ RUN mkdir -p \
+ ${APP_HOME}/app/output \
+ ${APP_HOME}/app/input \
+ ${APP_HOME}/app/logs \
+ ${APP_HOME}/app/usage \
+ ${APP_HOME}/app/feedback \
+ ${APP_HOME}/app/config \
+ && chown user:user \
+ ${APP_HOME}/app/output \
+ ${APP_HOME}/app/input \
+ ${APP_HOME}/app/logs \
+ ${APP_HOME}/app/usage \
+ ${APP_HOME}/app/feedback \
+ ${APP_HOME}/app/config \
+ && chmod 755 \
+ ${APP_HOME}/app/output \
+ ${APP_HOME}/app/input \
+ ${APP_HOME}/app/logs \
+ ${APP_HOME}/app/usage \
+ ${APP_HOME}/app/feedback \
+ ${APP_HOME}/app/config
+
+ # Now handle the /tmp and /var/tmp directories and their subdirectories
+ RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
+ && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
+ && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
+ && chmod 700 ${XDG_CACHE_HOME}
 
  # Copy installed packages from builder stage
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
 
- # Download NLTK data packages - now no longer necessary
- # RUN python -m nltk.downloader --quiet punkt stopwords punkt_tab
+ # Copy app code and entrypoint with correct ownership
+ COPY --chown=user . $APP_HOME/app
 
- # Entrypoint helps to switch between Gradio and Lambda mode
+ # Copy and chmod entrypoint
  COPY entrypoint.sh /entrypoint.sh
-
  RUN chmod +x /entrypoint.sh
 
- # Switch to the "user" user
+ # Switch to user
  USER user
 
- ENV APP_HOME=/home/user
+ # Declare working directory
+ WORKDIR $APP_HOME/app
 
- # Set environmental variables
+ # Declare volumes (NOTE: runtime mounts will override permissions — handle with care)
+ VOLUME ["/tmp/matplotlib_cache"]
+ VOLUME ["/tmp/gradio_tmp"]
+ VOLUME ["/tmp/tld"]
+ VOLUME ["/home/user/app/output"]
+ VOLUME ["/home/user/app/input"]
+ VOLUME ["/home/user/app/logs"]
+ VOLUME ["/home/user/app/usage"]
+ VOLUME ["/home/user/app/feedback"]
+ VOLUME ["/home/user/app/config"]
+ VOLUME ["/tmp"]
+ VOLUME ["/var/tmp"]
+
+ # Set runtime environment
  ENV PATH=$APP_HOME/.local/bin:$PATH \
  PYTHONPATH=$APP_HOME/app \
  PYTHONUNBUFFERED=1 \
@@ -80,20 +127,8 @@ ENV PATH=$APP_HOME/.local/bin:$PATH \
  GRADIO_NUM_PORTS=1 \
  GRADIO_SERVER_NAME=0.0.0.0 \
  GRADIO_SERVER_PORT=7860 \
- GRADIO_ANALYTICS_ENABLED=False \
- TLDEXTRACT_CACHE=$APP_HOME/app/tld/.tld_set_snapshot \
- SYSTEM=spaces
-
- # Set the working directory to the user's home directory
- WORKDIR $APP_HOME/app
-
- # Copy the app code to the container
- COPY --chown=user . $APP_HOME/app
-
- # Ensure permissions are really user:user again after copying
- RUN chown -R user:user $APP_HOME/app && chmod -R u+rwX $APP_HOME/app
+ GRADIO_ANALYTICS_ENABLED=False
 
- ENTRYPOINT [ "/entrypoint.sh" ]
+ ENTRYPOINT ["/entrypoint.sh"]
 
- # Default command for Lambda mode
- CMD [ "lambda_entrypoint.lambda_handler" ]
+ CMD ["lambda_entrypoint.lambda_handler"]
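For orientation, the following is a minimal sketch of how a Python configuration module might pick up the folder variables defined in the ENV block above. It is illustrative only, not code from the repository, and the default paths shown are assumptions; the names mirror the ENV block and the imports in app.py (e.g. OUTPUT_FOLDER, ACCESS_LOGS_FOLDER).

```python
import os

def _folder_from_env(name: str, default: str) -> str:
    """Read a folder path from the environment and make sure it exists."""
    path = os.environ.get(name, default)
    os.makedirs(path, exist_ok=True)
    return path

# Assumed defaults; the real values live in tools/config.py and may differ.
OUTPUT_FOLDER = _folder_from_env("GRADIO_OUTPUT_FOLDER", "output/")
INPUT_FOLDER = _folder_from_env("GRADIO_INPUT_FOLDER", "input/")
ACCESS_LOGS_FOLDER = _folder_from_env("ACCESS_LOGS_FOLDER", "logs/")
USAGE_LOGS_FOLDER = _folder_from_env("USAGE_LOGS_FOLDER", "usage/")
FEEDBACK_LOGS_FOLDER = _folder_from_env("FEEDBACK_LOGS_FOLDER", "feedback/")
```

The /tmp cache locations set above (TLDEXTRACT_CACHE, MPLCONFIGDIR, GRADIO_TEMP_DIR) relate to the read-only file system compatibility mentioned in the commit message: on such deployments only explicitly mounted volumes and temporary paths remain writable.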
 
README.md CHANGED
@@ -5,16 +5,16 @@ colorFrom: blue
  colorTo: yellow
  sdk: docker
  app_file: app.py
- pinned: false
+ pinned: true
  license: agpl-3.0
  ---
  # Document redaction
 
- version: 0.6.8
+ version: 0.7.0
 
  Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
 
- To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
+ To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works quite well for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
 
  After redaction, review suggested redactions on the 'Review redactions' tab. The original pdf can be uploaded here alongside a '...redaction_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or custom terms to always include/ exclude from redaction.
 
@@ -181,6 +181,8 @@ If the table is empty, you can add a new entry, you can add a new row by clickin
 
  ![Manually modify allow or deny list filled](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/manually_modify_filled.PNG)
 
+ **Note:** As of version 0.7.0 you can now apply your whole page redaction list directly to the document file currently under review by clicking the 'Apply whole page redaction list to document currently under review' button that appears here.
+
  ### Redacting additional types of personal information
 
  You may want to redact additional types of information beyond the defaults, or you may not be interested in default suggested entity types. There are dates in the example complaint letter. Say we wanted to redact those dates also?
@@ -390,21 +392,49 @@ You can find this option at the bottom of the 'Redaction Settings' tab. Upload m
 
  The files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/).
 
- Some redaction tasks involve removing duplicate pages of text that may exist across multiple documents. This feature calculates the similarity of text in all pages of input PDFs, calculates a similarity score, and then flags pages above a certain similarity score (90%) for removal by creating a 'whole page' redaction list file for each input PDF.
-
- ![Example duplicate page outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_interface.PNG)
-
- The similarity calculation is based on using the 'ocr_outputs.csv' file that is output every time that you perform a redaction task. From the file folder, upload the four 'ocr_output.csv' files provided in the example folder into the file area. Click 'Identify duplicate pages' and you will see a number of files returned. In case you want to see the original PDFs, they are available [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/input_pdfs/).
-
- ![Identify duplicate pages interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_input_interface.PNG)
-
- First, there is a 'combined_ocr_result...' file that just merges together all the text from the input files. 'page_similarity_results.csv' shows a breakdown of the pages from each file that are most similar to each other above the threshold (90% similarity). You can compare the text in the two columns 'Page_1_Text' and 'Page_2_Text'.
+ Some redaction tasks involve removing duplicate pages of text that may exist in single or multiple documents. This feature helps you find and remove that duplicate content, identifying everything from single identical pages to multi-page sections (subdocuments). The process involves three main steps: configuring the analysis, reviewing the results in the interactive interface, and then using the generated files to perform the redactions.
+
+ ![Example duplicate page inputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_input_interface_new.PNG)
+
+ **Step 1: Upload and Configure the Analysis**
+ First, navigate to the "Identify duplicate pages" tab. Upload all the ocr_output.csv files you wish to compare into the file area. These files are generated every time you run a redaction task and contain the text for each page of a document.
+
+ For our example, you can upload the four 'ocr_output.csv' files provided in the example folder into the file area. Click 'Identify duplicate pages' and you will see a number of files returned. In case you want to see the original PDFs, they are available [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/input_pdfs/).
+
+ The default options will search for matching subdocuments of any length. Before running the analysis, you can configure these matching parameters to tell the tool what you're looking for:
+
+ ![Duplicate matching parameters](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_matching_parameters.PNG)
 
+ *Matching Parameters*
+ - **Similarity Threshold:** A score from 0 to 1. Pages or sequences of pages with a calculated text similarity above this value will be considered a match. The default of 0.9 (90%) is a good starting point for finding near-identical pages.
+ - **Min Word Count:** Pages with fewer words than this value will be completely ignored during the comparison. This is extremely useful for filtering out blank pages, title pages, or boilerplate pages that might otherwise create noise in the results. The default is 10.
+ - **Choosing a Matching Strategy:** You have three main options to find duplicate content.
+ - *'Subdocument' matching (default):* Use this to find the longest possible sequence of matching pages. The tool will find an initial match and then automatically expand it forward page-by-page until the consecutive match breaks. This is the best method for identifying complete copied chapters or sections of unknown length. It is enabled by default by ticking the "Enable 'subdocument' matching" box, and it overrides the settings described below.
+ - *Minimum length subdocument matching:* Use this to find sequences of consecutively matching pages with a minimum page length. For example, setting the slider to 3 will only return sections that are at least 3 pages long. To enable this, untick the "Enable 'subdocument' matching" box and set the "Minimum consecutive pages" slider to a value greater than 1.
+ - *Single Page Matching:* Use this to find all individual page pairs that are similar to each other. Leave the "Enable 'subdocument' matching" box unchecked and keep the "Minimum consecutive pages" slider at 1.
 
+ Once your parameters are set, click the "Identify duplicate pages/subdocuments" button.
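To make the parameters above concrete, here is a small, self-contained sketch of scoring page similarity with a threshold and a minimum word count. It is illustrative only; the app's own logic lives in tools/find_duplicate_pages.py and will differ.

```python
# Illustrative sketch of the comparison described above, not the app's implementation.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def similar_page_pairs(pages_a, pages_b, threshold=0.9, min_word_count=10):
    """Return (page_index_a, page_index_b, score) for pairs scoring above the threshold."""
    keep_a = [i for i, t in enumerate(pages_a) if len(t.split()) >= min_word_count]
    keep_b = [j for j, t in enumerate(pages_b) if len(t.split()) >= min_word_count]
    if not keep_a or not keep_b:
        return []
    vec = TfidfVectorizer().fit([pages_a[i] for i in keep_a] + [pages_b[j] for j in keep_b])
    scores = cosine_similarity(vec.transform([pages_a[i] for i in keep_a]),
                               vec.transform([pages_b[j] for j in keep_b]))
    return [(keep_a[r], keep_b[c], float(scores[r, c]))
            for r in range(scores.shape[0]) for c in range(scores.shape[1])
            if scores[r, c] >= threshold]
```

Subdocument matching would then take each pair found this way and keep extending it while the following page of one file also matches the following page of the other, as described in the 'Subdocument' matching bullet above.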
 
+ **Step 2: Review Results in the Interface**
+ After the analysis is complete, the results will be displayed directly in the interface.
+
+ *Analysis Summary:* A table will appear showing a summary of all the matches found. The columns will change depending on the matching strategy you chose. For subdocument matches, it will show the start and end pages of the matched sequence.
+
+ *Interactive Preview:* This is the most important part of the review process. Click on any row in the summary table. The full text of the matching page(s) will appear side-by-side in the "Full Text Preview" section below, allowing you to instantly verify the accuracy of the match.
+
+ ![Duplicate review interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_review_overview.PNG)
+
+ **Step 3: Download and Use the Output Files**
+ The analysis also generates a set of downloadable files for your records and for performing redactions.
+
+ - page_similarity_results.csv: This is a detailed report of the analysis you just ran. It shows a breakdown of the pages from each file that are most similar to each other above the similarity threshold. You can compare the text in the two columns 'Page_1_Text' and 'Page_2_Text'. For single-page matches, it will list each pair of matching pages. For subdocument matches, it will list the start and end pages of each matched sequence, along with the total length of the match.
 
  ![Page similarity file example](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/page_similarity_example.PNG)
 
- The remaining output files are suffixed with '_whole_page.csv'. These are the same files that can be used to redact whole pages as described in the ['Full page redaction list example' section](#full-page-redaction-list-example). For each PDF involved in the duplicate detection process, you can upload the relevant '_whole_page.csv' file into the relevant area, then do a new redaction task for the PDF file without any entity types selected. This way, only the suggested whole pages will be suggested for redaction and nothing else.
+ - [Original_Filename]_pages_to_redact.csv: For each input document that was found to contain duplicate content, a separate redaction list is created. This is a simple, one-column CSV file containing a list of all page numbers that should be removed. To use these files, you can upload the original document (i.e. the PDF) on the 'Review redactions' tab and then click the 'Apply relevant duplicate page output to document currently under review' button; the suggested whole-page redactions should then appear on that tab. Alternatively, you can reupload the file into the whole page redaction section as described in the ['Full page redaction list example' section](#full-page-redaction-list-example).
 
- ![Example duplicate page redaction list](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/output_file_2_whole_page_outputs.PNG)
+ ![Example duplicate page redaction list](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_interface_new.PNG)
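As a quick way to inspect one of these redaction lists, the one-column file can be read back with pandas. This is a hypothetical snippet: the real filename follows the input document's name, and the column header may differ.

```python
import pandas as pd

# Hypothetical filename; in practice the file is named after the original document.
pages_to_redact = pd.read_csv("my_document_pages_to_redact.csv")
print(pages_to_redact.iloc[:, 0].tolist())  # e.g. [4, 5, 6, 12] - pages flagged for whole-page redaction
```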
 
  If you want to combine the results from this redaction process with previous redaction tasks for the same PDF, you could merge review file outputs following the steps described in [Merging existing redaction review files](#merging-existing-redaction-review-files) above.
 
@@ -505,6 +535,8 @@ Again, a lot can potentially go wrong with AWS solutions that are insecure, so b
 
  ## Modifying existing redaction review files
 
+ *Note:* As of version 0.7.0 you can now modify redaction review files directly in the app on the 'Review redactions' tab. Open the accordion 'View and edit review data' under the file input area. You can edit review file data cells here - press Enter to apply changes. You should see the effect on the current page if you click the 'Save changes on current page to file' button to the right.
+
  You can find the folder containing the files discussed in this section [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/).
 
  As well as serving as inputs to the document redaction app's review function, the 'review_file.csv' output can be modified outside of the app, and also merged with others from multiple redaction attempts on the same file. This gives you the flexibility to change redaction details outside of the app.
_quarto.yml ADDED
@@ -0,0 +1,28 @@
+ project:
+   type: website
+   output-dir: docs # Common for GitHub Pages
+   render:
+     - "*.qmd"
+
+ website:
+   title: "Document Redaction App"
+   page-navigation: true # Often enabled for floating TOC to highlight current section
+   back-to-top-navigation: true
+   search: true
+   navbar:
+     left:
+       - href: index.qmd
+         text: Home
+       - href: src/user_guide.qmd
+         text: User guide
+       - href: src/faq.qmd
+         text: User FAQ
+       - href: src/installation_guide.qmd
+         text: App installation guide (with CDK)
+       - href: src/app_settings.qmd
+         text: App settings management guide
+
+ format:
+   html:
+     theme: cosmo
+     css: styles.css
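With this configuration, running `quarto render` from the project root should write the rendered site into the docs/ folder set by output-dir above, which is a layout commonly used for publishing through GitHub Pages.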
app.py CHANGED
@@ -3,37 +3,39 @@ import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
  from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS
6
- from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
10
- from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text
11
  from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
- from tools.find_duplicate_pages import identify_similar_pages
16
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
17
 
18
  # Suppress downcasting warnings
19
  pd.set_option('future.no_silent_downcasting', True)
20
 
21
  # Convert string environment variables to string or list
22
- SAVE_LOGS_TO_CSV = eval(SAVE_LOGS_TO_CSV)
23
- SAVE_LOGS_TO_DYNAMODB = eval(SAVE_LOGS_TO_DYNAMODB)
 
 
24
 
25
- if CSV_ACCESS_LOG_HEADERS: CSV_ACCESS_LOG_HEADERS = eval(CSV_ACCESS_LOG_HEADERS)
26
- if CSV_FEEDBACK_LOG_HEADERS: CSV_FEEDBACK_LOG_HEADERS = eval(CSV_FEEDBACK_LOG_HEADERS)
27
- if CSV_USAGE_LOG_HEADERS: CSV_USAGE_LOG_HEADERS = eval(CSV_USAGE_LOG_HEADERS)
28
 
29
- if DYNAMODB_ACCESS_LOG_HEADERS: DYNAMODB_ACCESS_LOG_HEADERS = eval(DYNAMODB_ACCESS_LOG_HEADERS)
30
- if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = eval(DYNAMODB_FEEDBACK_LOG_HEADERS)
31
- if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = eval(DYNAMODB_USAGE_LOG_HEADERS)
32
 
33
- if CHOSEN_COMPREHEND_ENTITIES: CHOSEN_COMPREHEND_ENTITIES = eval(CHOSEN_COMPREHEND_ENTITIES)
34
- if FULL_COMPREHEND_ENTITY_LIST: FULL_COMPREHEND_ENTITY_LIST = eval(FULL_COMPREHEND_ENTITY_LIST)
35
- if CHOSEN_REDACT_ENTITIES: CHOSEN_REDACT_ENTITIES = eval(CHOSEN_REDACT_ENTITIES)
36
- if FULL_ENTITY_LIST: FULL_ENTITY_LIST = eval(FULL_ENTITY_LIST)
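The removed block above turned list-like settings loaded from config strings into Python objects with eval(). For illustration only (this is not necessarily what the commit replaces it with), the standard-library ast.literal_eval handles the same job without executing arbitrary code:

```python
import ast

def parse_list_setting(value, default=None):
    """Parse a config string such as "['PERSON', 'EMAIL_ADDRESS']" into a Python list."""
    if isinstance(value, list):
        return value          # already parsed
    if not value:
        return default or []  # unset or empty setting
    return ast.literal_eval(value)  # fails loudly on anything that is not a plain literal

print(parse_list_setting("['PERSON', 'EMAIL_ADDRESS']"))  # example input, not real app config
```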
37
 
38
  # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
39
  CHOSEN_COMPREHEND_ENTITIES.extend(custom_entities)
@@ -42,7 +44,7 @@ FULL_COMPREHEND_ENTITY_LIST.extend(custom_entities)
42
  FILE_INPUT_HEIGHT = int(FILE_INPUT_HEIGHT)
43
 
44
  # Create the gradio interface
45
- app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
46
 
47
  with app:
48
 
@@ -55,7 +57,7 @@ with app:
55
  all_image_annotations_state = gr.State([])
56
 
57
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
58
- review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
59
 
60
  all_page_line_level_ocr_results = gr.State([])
61
  all_page_line_level_ocr_results_with_children = gr.State([])
@@ -186,6 +188,9 @@ with app:
186
  # Duplicate page detection
187
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
188
  duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
189
 
190
  # Tracking variables for current page (not visible)
191
  current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
@@ -230,7 +235,7 @@ with app:
230
 
231
  Redact personally identifiable information (PII) from documents (PDF, images), open text, or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use the app. Below is a very brief overview.
232
 
233
- To identify text in documents, the 'Local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
234
 
235
  After redaction, review suggested redactions on the 'Review redactions' tab. The original pdf can be uploaded here alongside a '...review_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or custom terms to always include/ exclude from redaction.
236
 
@@ -259,9 +264,9 @@ with app:
259
  local_ocr_output_found_checkbox = gr.Checkbox(value= False, label="Existing local OCR output file found", interactive=False, visible=True)
260
  with gr.Column(scale=4):
261
  with gr.Row(equal_height=True):
262
- total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
263
- estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True)
264
- estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
265
 
266
  if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
267
  with gr.Accordion("Assign task to cost code", open = True, visible=True):
@@ -317,7 +322,10 @@ with app:
317
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
318
  annotate_zoom_out = gr.Button("Zoom out", visible=False)
319
  with gr.Row():
320
- clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
321
 
322
  with gr.Row():
323
  with gr.Column(scale=2):
@@ -376,7 +384,8 @@ with app:
376
 
377
  with gr.Accordion("Search all extracted text", open=True):
378
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
379
- reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
 
380
 
381
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
382
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
@@ -387,13 +396,67 @@ with app:
387
  # IDENTIFY DUPLICATE PAGES TAB
388
  ###
389
  with gr.Tab(label="Identify duplicate pages"):
390
- with gr.Accordion("Identify duplicate pages to redact", open = True):
391
- in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv'])
392
  with gr.Row():
393
- duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
394
- find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 4)
 
396
- duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv'])
 
398
  ###
399
  # TEXT / TABULAR DATA TAB
@@ -448,6 +511,13 @@ with app:
448
  in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["allow_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Allow list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, wrap=True)
449
  in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["deny_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Deny list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, wrap=True)
450
  in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["fully_redacted_pages_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Fully redacted pages", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, datatype='number', wrap=True)
451
 
452
  with gr.Accordion("Select entity types to redact", open = True):
453
  in_redact_entities = gr.Dropdown(value=CHOSEN_REDACT_ENTITIES, choices=FULL_ENTITY_LIST, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
@@ -517,24 +587,24 @@ with app:
517
  cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
518
 
519
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
520
- success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
521
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
522
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox])
523
 
524
  # Run redaction function
525
  document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
526
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
527
- success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
528
- outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children], api_name="redact_doc")
529
 
530
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
531
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
532
- outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children])
533
 
534
  # If a file has been completed, the function will continue onto the next document
535
- latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
536
- outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
537
- success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
538
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
539
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
540
  success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title]).\
@@ -556,62 +626,67 @@ with app:
556
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
557
 
558
  convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
559
- success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
560
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
561
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
562
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
563
  success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
564
- success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
565
- outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
566
- success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
567
 
568
  ###
569
  # REVIEW PDF REDACTIONS
570
  ###
 
571
 
572
  # Upload previous files for modifying redactions
573
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
574
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
575
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox], api_name="prepare_doc").\
576
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
577
 
578
  # Page number controls
579
  annotate_current_page.submit(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
580
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
581
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
582
 
583
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
584
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
585
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
586
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
587
 
588
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
589
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
590
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
591
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
592
 
593
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
594
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
595
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
596
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
597
 
598
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
599
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
600
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
601
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
602
 
603
  annotate_current_page_bottom.submit(update_other_annotator_number_from_current, inputs=[annotate_current_page_bottom], outputs=[annotate_current_page]).\
604
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
605
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
606
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
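The repeated page-navigation handlers above all follow the same Gradio pattern: an event listener updates the page number, and chained `.success(...)` steps then persist the current page's annotations, rebuild the annotator view, and re-apply redactions, with each step running only if the previous one finished without error. A minimal, self-contained sketch of that chaining pattern (component and function names here are illustrative placeholders, not the app's real objects) looks like this:

```python
# Minimal sketch of the .click(...).success(...) chaining used by the page controls above.
# Component and function names are placeholders, not the app's real objects.
import gradio as gr

def decrease_page(page):
    # Never go below page 1.
    return max(1, page - 1)

def save_page_annotations(page, annotations):
    # Placeholder for persisting the current page's annotation objects into state.
    return annotations + [f"page {page} saved"]

def redraw_annotator(page, annotations):
    # Placeholder for rebuilding the annotator view for the newly selected page.
    return f"Showing page {page} ({len(annotations)} saved snapshots)"

with gr.Blocks() as demo:
    annotations_state = gr.State([])
    page_number = gr.Number(value=1, precision=0, label="Page")
    previous_page_btn = gr.Button("Previous page")
    annotator_placeholder = gr.Textbox(label="Annotator")

    # Each .success() step only runs if the previous step finished without an error,
    # mirroring the app's "change page -> save state -> redraw -> re-apply" sequences.
    previous_page_btn.click(decrease_page, inputs=page_number, outputs=page_number).\
        success(save_page_annotations, inputs=[page_number, annotations_state], outputs=annotations_state).\
        success(redraw_annotator, inputs=[page_number, annotations_state], outputs=annotator_placeholder)

if __name__ == "__main__":
    demo.launch()
```

Chaining with `.success()` rather than `.then()` stops the sequence as soon as one step raises, so a failed save does not go on to overwrite the annotator view.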
607
 
608
  # Apply page redactions
609
- annotation_button_apply.click(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state], scroll_to_output=True)
610
 
611
  # Save current page redactions
612
  update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
613
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
614
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
615
 
616
  # Review table controls
617
  recogniser_entity_dropdown.select(update_entities_df_recogniser_entities, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, text_entity_dropdown, page_entity_dropdown])
@@ -620,50 +695,52 @@ with app:
620
 
621
  # Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
622
  recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
623
- success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_state, selected_entity_id, selected_entity_colour], outputs=[review_file_state, selected_entity_id, selected_entity_colour]).\
624
- success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_state, annotate_previous_page])
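The dataframe row-selection wiring above relies on Gradio passing a `gr.SelectData` event to the callback, from which the clicked row (and therefore the target page and redaction box) can be recovered. A hedged, standalone sketch of that mechanism, using simplified column names rather than the app's full review-file schema, follows:

```python
# Sketch of a gr.Dataframe .select callback that recovers the clicked row.
# Column names and the downstream jump-to-page behaviour are simplified assumptions.
import gradio as gr
import pandas as pd

def on_row_select(df: pd.DataFrame, evt: gr.SelectData):
    # For a Dataframe selection, evt.index is (row_index, column_index).
    row_idx = evt.index[0]
    selected_row = df.iloc[[row_idx]]
    page_to_show = int(selected_row["page"].iloc[0])
    return selected_row, page_to_show

with gr.Blocks() as demo:
    entity_df = gr.Dataframe(
        value=pd.DataFrame({"page": [1, 2, 2], "label": ["PERSON", "EMAIL", "PHONE"]}),
        interactive=False,
    )
    selected_row_state = gr.Dataframe(visible=False)
    current_page = gr.Number(value=1, precision=0, label="Page")

    # Clicking a cell fires .select; the callback receives the dataframe value plus the event data.
    entity_df.select(on_row_select, inputs=[entity_df], outputs=[selected_row_state, current_page])

if __name__ == "__main__":
    demo.launch()
```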
 
625
 
626
  reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
627
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
628
 
629
  # Exclude current selection from annotator and outputs
630
  # Exclude only selected row
631
- exclude_selected_row_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, selected_entity_dataframe_row, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
632
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
633
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state]).\
634
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
635
 
636
  # Exclude all items with same text as selected row
637
  exclude_text_with_same_as_selected_row_btn.click(get_all_rows_with_same_text, inputs=[recogniser_entity_dataframe_base, selected_entity_dataframe_row_text], outputs=[recogniser_entity_dataframe_same_text]).\
638
- success(exclude_selected_items_from_redaction, inputs=[review_file_state, recogniser_entity_dataframe_same_text, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
639
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
640
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state]).\
641
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
642
 
643
  # Exclude everything visible in table
644
- exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
645
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
646
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state]).\
647
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
648
 
649
-
650
-
651
- undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base]).\
652
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
653
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
654
 
655
  # Review OCR text button
656
- all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row])
657
  reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
658
 
659
  # Convert review file to xfdf Adobe format
660
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
661
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
662
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
663
 
664
  # Convert xfdf Adobe file back to review_file.csv
665
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
666
- success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
667
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
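`convert_df_to_xfdf` and `convert_xfdf_to_dataframe` round-trip review data to and from Adobe's XFDF annotation format so that suggested redactions can be checked in Acrobat. The exact elements the app writes are defined in `tools.redaction_review`; purely as an illustration of the kind of output such a conversion might produce, a minimal export of review rows as XFDF square annotations could look like the sketch below (element names, attributes, and coordinate handling here are assumptions, not the app's actual output):

```python
# Hedged sketch: writing redaction boxes as XFDF square annotations.
# Coordinate handling, element names and attributes are simplified assumptions.
import xml.etree.ElementTree as ET
import pandas as pd

XFDF_NS = "http://ns.adobe.com/xfdf/"

def review_df_to_xfdf(review_df: pd.DataFrame, out_path: str) -> None:
    ET.register_namespace("", XFDF_NS)
    root = ET.Element(f"{{{XFDF_NS}}}xfdf")
    annots = ET.SubElement(root, f"{{{XFDF_NS}}}annots")
    for _, row in review_df.iterrows():
        square = ET.SubElement(annots, f"{{{XFDF_NS}}}square")
        # XFDF pages are zero-indexed; rect is "x1,y1,x2,y2" in PDF points.
        square.set("page", str(int(row["page"]) - 1))
        square.set("rect", f'{row["xmin"]},{row["ymin"]},{row["xmax"]},{row["ymax"]}')
        square.set("color", "#000000")
        contents = ET.SubElement(square, f"{{{XFDF_NS}}}contents")
        contents.text = str(row["label"])
    ET.ElementTree(root).write(out_path, encoding="utf-8", xml_declaration=True)

# Example usage with the review-file columns used elsewhere in the app:
df = pd.DataFrame([{"page": 1, "label": "PERSON", "xmin": 72, "ymin": 700, "xmax": 200, "ymax": 715}])
review_df_to_xfdf(df, "example.xfdf")
```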
668
 
669
  ###
@@ -676,7 +753,7 @@ with app:
676
  success(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number], api_name="redact_data").\
677
  success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
678
 
679
- # Currently only supports redacting one data file at a time
680
  # If the output file count text box changes, keep going with redacting each data file until done
681
  # text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number]).\
682
  # success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
@@ -684,7 +761,42 @@ with app:
684
  ###
685
  # IDENTIFY DUPLICATE PAGES
686
  ###
687
- find_duplicate_pages_btn.click(fn=identify_similar_pages, inputs=[in_duplicate_pages, duplicate_threshold_value, output_folder_textbox], outputs=[duplicate_pages_df, duplicate_pages_out])
688
 
689
  ###
690
  # SETTINGS PAGE INPUT / OUTPUT
@@ -699,6 +811,13 @@ with app:
699
  in_deny_list_state.input(update_dataframe, inputs=[in_deny_list_state], outputs=[in_deny_list_state])
700
  in_fully_redacted_list_state.input(update_dataframe, inputs=[in_fully_redacted_list_state], outputs=[in_fully_redacted_list_state])
701
 
702
  # Merge multiple review csv files together
703
  merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
704
 
 
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
  from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS
6
+ from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
10
+ from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text, increase_bottom_page_count_based_on_top, store_duplicate_selection
11
  from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
+ from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list
16
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
17
 
18
  # Suppress downcasting warnings
19
  pd.set_option('future.no_silent_downcasting', True)
20
 
21
  # Convert string environment variables to string or list
22
+ if SAVE_LOGS_TO_CSV == "True": SAVE_LOGS_TO_CSV = True
23
+ else: SAVE_LOGS_TO_CSV = False
24
+ if SAVE_LOGS_TO_DYNAMODB == "True": SAVE_LOGS_TO_DYNAMODB = True
25
+ else: SAVE_LOGS_TO_DYNAMODB = False
26
 
27
+ if CSV_ACCESS_LOG_HEADERS: CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
28
+ if CSV_FEEDBACK_LOG_HEADERS: CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)
29
+ if CSV_USAGE_LOG_HEADERS: CSV_USAGE_LOG_HEADERS = _get_env_list(CSV_USAGE_LOG_HEADERS)
30
 
31
+ if DYNAMODB_ACCESS_LOG_HEADERS: DYNAMODB_ACCESS_LOG_HEADERS = _get_env_list(DYNAMODB_ACCESS_LOG_HEADERS)
32
+ if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = _get_env_list(DYNAMODB_FEEDBACK_LOG_HEADERS)
33
+ if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = _get_env_list(DYNAMODB_USAGE_LOG_HEADERS)
34
 
35
+ if CHOSEN_COMPREHEND_ENTITIES: CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
36
+ if FULL_COMPREHEND_ENTITY_LIST: FULL_COMPREHEND_ENTITY_LIST = _get_env_list(FULL_COMPREHEND_ENTITY_LIST)
37
+ if CHOSEN_REDACT_ENTITIES: CHOSEN_REDACT_ENTITIES = _get_env_list(CHOSEN_REDACT_ENTITIES)
38
+ if FULL_ENTITY_LIST: FULL_ENTITY_LIST = _get_env_list(FULL_ENTITY_LIST)
39
 
40
  # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
41
  CHOSEN_COMPREHEND_ENTITIES.extend(custom_entities)
 
44
  FILE_INPUT_HEIGHT = int(FILE_INPUT_HEIGHT)
45
 
46
  # Create the gradio interface
47
+ app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True) #gr.themes.Base()
48
 
49
  with app:
50
 
 
57
  all_image_annotations_state = gr.State([])
58
 
59
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
60
+
61
 
62
  all_page_line_level_ocr_results = gr.State([])
63
  all_page_line_level_ocr_results_with_children = gr.State([])
 
188
  # Duplicate page detection
189
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
190
  duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
191
+ full_duplicated_data_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="full_duplicated_data_df", visible=False, type="pandas", wrap=True)
192
+ selected_duplicate_data_row_index = gr.Number(value=None, label="selected_duplicate_data_row_index", visible=False)
193
+ full_duplicate_data_by_file = gr.State() # A dictionary of the full duplicate data indexed by file
194
 
195
  # Tracking variables for current page (not visible)
196
  current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
 
235
 
236
  Redact personally identifiable information (PII) from documents (PDF, images), open text, or tabular data (XLSX/CSV/Parquet). Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use the app. Below is a very brief overview.
237
 
238
+ To identify text in documents, the 'Local' text/OCR image analysis uses spaCy/Tesseract, and works well only for documents with typed text. If available, choose 'AWS Textract' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
239
 
240
  After redaction, review suggested redactions on the 'Review redactions' tab. The original pdf can be uploaded here alongside a '...review_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or custom terms to always include/ exclude from redaction.
241
 
 
264
  local_ocr_output_found_checkbox = gr.Checkbox(value= False, label="Existing local OCR output file found", interactive=False, visible=True)
265
  with gr.Column(scale=4):
266
  with gr.Row(equal_height=True):
267
+ total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True, interactive=False)
268
+ estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True, interactive=False)
269
+ estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2, interactive=False)
270
 
271
  if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
272
  with gr.Accordion("Assign task to cost code", open = True, visible=True):
 
322
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
323
  annotate_zoom_out = gr.Button("Zoom out", visible=False)
324
  with gr.Row():
325
+ clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
326
+
327
+ with gr.Accordion(label = "View and edit review file data", open=False):
328
+ review_file_df = gr.Dataframe(value=pd.DataFrame(), headers=['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id'], row_count = (0, "dynamic"), label="Review file data", visible=True, type="pandas", wrap=True, show_search=True, show_fullscreen_button=True, show_copy_button=True)
329
 
330
  with gr.Row():
331
  with gr.Column(scale=2):
 
384
 
385
  with gr.Accordion("Search all extracted text", open=True):
386
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
387
+ reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
388
+ selected_ocr_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "text":[]}), col_count=2, type="pandas", visible=False, headers=["page", "text"], wrap=True)
389
 
390
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
391
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
 
396
  # IDENTIFY DUPLICATE PAGES TAB
397
  ###
398
  with gr.Tab(label="Identify duplicate pages"):
399
+ gr.Markdown("Search for duplicate pages/subdocuments in your ocr_output files. By default, this function will search for duplicate text across multiple pages, and then join consecutive matching pages together into matched 'subdocuments'. The results can be reviewed below, false positives can be removed, and the verified results can then be applied to a document loaded on the 'Review redactions' tab.")
400
+
401
+ with gr.Accordion("Step 1: Configure and run analysis", open = True):
402
+ in_duplicate_pages = gr.File(
403
+ label="Upload one or multiple 'ocr_output.csv' files to find duplicate pages and subdocuments",
404
+ file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv']
405
+ )
406
+
407
+ with gr.Accordion("Duplicate matching parameters", open = False):
408
+ with gr.Row():
409
+ duplicate_threshold_input = gr.Number(value=0.95, label="Similarity threshold", info="Score (0-1) to consider pages a match.")
410
+ min_word_count_input = gr.Number(value=10, label="Minimum word count", info="Pages with fewer words than this value are ignored.")
411
+
412
+ gr.Markdown("#### Matching Strategy")
413
+ greedy_match_input = gr.Checkbox(
414
+ label="Enable 'subdocument' matching",
415
+ value=True,
416
+ info="If checked, finds the longest possible sequence of matching pages (subdocuments), minimum length one page. Overrides the slider below."
417
+ )
418
+ min_consecutive_pages_input = gr.Slider(
419
+ minimum=1, maximum=20, value=1, step=1,
420
+ label="Minimum consecutive pages (modified subdocument match)",
421
+ info="If the greedy matching option above is unticked, use this to find only subdocuments of at least this many consecutive pages."
422
+ )
423
+
424
+ find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages/subdocuments", variant="primary")
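The controls above (similarity threshold, minimum word count, greedy subdocument matching) suggest a pipeline along the lines of: combine each page's OCR lines into one text, vectorise the pages, compute pairwise cosine similarity, keep pairs above the threshold, and then merge runs of consecutive matching pages into subdocuments. The actual logic lives in `tools.find_duplicate_pages.run_duplicate_analysis`; the sketch below only illustrates that general approach and is not the app's implementation.

```python
# Illustrative sketch of threshold-based duplicate page detection; not the app's code.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_pages(ocr_df: pd.DataFrame, threshold: float = 0.95, min_words: int = 10) -> pd.DataFrame:
    """ocr_df is assumed to have 'page' and 'text' columns, one row per OCR line."""
    pages = ocr_df.groupby("page")["text"].apply(" ".join).reset_index()
    pages = pages[pages["text"].str.split().str.len() >= min_words]

    tfidf = TfidfVectorizer().fit_transform(pages["text"])
    similarity = cosine_similarity(tfidf)

    matches = []
    page_ids = pages["page"].tolist()
    for i in range(len(page_ids)):
        for j in range(i + 1, len(page_ids)):
            if similarity[i, j] >= threshold:
                matches.append({"page_1": page_ids[i], "page_2": page_ids[j],
                                "similarity": round(float(similarity[i, j]), 3)})
    return pd.DataFrame(matches, columns=["page_1", "page_2", "similarity"])

def merge_consecutive(matches: pd.DataFrame) -> list:
    """Greedily join runs of consecutive matching pages into (start, end) subdocuments."""
    if matches.empty:
        return []
    pages = sorted(set(matches["page_2"]))
    runs, start = [], pages[0]
    for prev, curr in zip(pages, pages[1:]):
        if curr != prev + 1:
            runs.append((start, prev))
            start = curr
    runs.append((start, pages[-1]))
    return runs
```

With greedy matching disabled, the minimum-consecutive-pages slider would then simply filter out runs shorter than the chosen length.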
425
+
426
+ with gr.Accordion("Step 2: Review and refine results", open=True):
427
+ gr.Markdown("### Analysis summary\nClick on a row to select it for preview or exclusion.")
428
+
429
+ with gr.Row():
430
+ results_df_preview = gr.Dataframe(
431
+ label="Similarity Results",
432
+ wrap=True,
433
+ show_fullscreen_button=True,
434
+ show_search=True,
435
+ show_copy_button=True
436
+ )
437
  with gr.Row():
438
+ exclude_match_btn = gr.Button(
439
+ value=" Exclude Selected Match",
440
+ variant="stop"
441
+ )
442
+ gr.Markdown("Click a row in the table, then click this button to remove it from the results and update the downloadable files.")
443
+
444
+ gr.Markdown("### Full Text Preview of Selected Match")
445
+ with gr.Row():
446
+ page1_text_preview = gr.Dataframe(label="Match Source (Document 1)", wrap=True, headers=["page", "text"], show_fullscreen_button=True, show_search=True, show_copy_button=True)
447
+ page2_text_preview = gr.Dataframe(label="Match Duplicate (Document 2)", wrap=True, headers=["page", "text"], show_fullscreen_button=True, show_search=True, show_copy_button=True)
448
 
449
+ gr.Markdown("### Downloadable Files")
450
+ duplicate_files_out = gr.File(
451
+ label="Download analysis summary and redaction lists (.csv)",
452
+ file_count="multiple",
453
+ height=FILE_INPUT_HEIGHT
454
+ )
455
+
456
+ with gr.Row():
457
+ apply_match_btn = gr.Button(
458
+ value="Apply relevant duplicate page output to document currently under review",
459
+ variant="secondary")
460
 
461
  ###
462
  # TEXT / TABULAR DATA TAB
 
511
  in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["allow_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Allow list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, wrap=True)
512
  in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["deny_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Deny list", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, wrap=True)
513
  in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["fully_redacted_pages_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Fully redacted pages", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, datatype='number', wrap=True)
514
+ with gr.Row():
515
+ with gr.Column(scale=2):
516
+ markdown_placeholder = gr.Markdown("")
517
+ with gr.Column(scale=1):
518
+ apply_fully_redacted_list_btn = gr.Button(
519
+ value="Apply whole page redaction list to document currently under review",
520
+ variant="secondary")
521
 
522
  with gr.Accordion("Select entity types to redact", open = True):
523
  in_redact_entities = gr.Dropdown(value=CHOSEN_REDACT_ENTITIES, choices=FULL_ENTITY_LIST, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
 
587
  cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
588
 
589
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
590
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
591
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
592
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox])
593
 
594
  # Run redaction function
595
  document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
596
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
597
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
598
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children], api_name="redact_doc")
599
 
600
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
601
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
602
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children])
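As the comment here describes, `choose_and_run_redactor` processes the document in batches: each run writes the next page number back into `current_loop_page_number`, and that `.change` event re-invokes the function until the last page is reached (a similar `.change` on `latest_file_completed_text`, just below, then moves on to the next file). A stripped-down sketch of this self-retriggering loop, with made-up names and a toy batch worker, is:

```python
# Stripped-down sketch of the "state change re-runs the worker until finished" loop.
# Names and the toy batch worker are illustrative only.
import gradio as gr

BATCH_SIZE = 5
TOTAL_PAGES = 12

def process_batch(current_page, log):
    if current_page >= TOTAL_PAGES:
        # Returning gr.update() leaves the page number untouched, so no further
        # .change event fires and the loop stops.
        return gr.update(), log + "\nAll pages done."
    next_page = min(int(current_page) + BATCH_SIZE, TOTAL_PAGES)
    log = log + f"\nProcessed pages {int(current_page) + 1}-{next_page}."
    # Writing a new value to current_page fires its .change event, which calls this function again.
    return next_page, log

with gr.Blocks() as demo:
    current_page = gr.Number(value=0, precision=0, visible=False)
    progress_log = gr.Textbox(value="", label="Progress log")
    start_btn = gr.Button("Start")

    start_btn.click(process_batch, inputs=[current_page, progress_log], outputs=[current_page, progress_log])
    current_page.change(process_batch, inputs=[current_page, progress_log], outputs=[current_page, progress_log])

if __name__ == "__main__":
    demo.launch()
```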
603
 
604
  # If a file has been completed, the function will continue onto the next document
605
+ latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
606
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
607
+ success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
608
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
609
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
610
  success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title]).\
 
626
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
627
 
628
  convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
629
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
630
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
631
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
632
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
633
  success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
634
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
635
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
636
+ success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
637
 
638
  ###
639
  # REVIEW PDF REDACTIONS
640
  ###
641
+
642
 
643
  # Upload previous files for modifying redactions
644
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
645
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
646
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox], api_name="prepare_doc").\
647
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
648
+
649
+ # Manual updates to review df
650
+ review_file_df.input(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
651
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
652
 
653
  # Page number controls
654
  annotate_current_page.submit(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
655
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
656
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
657
 
658
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
659
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
660
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
661
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
662
 
663
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
664
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
665
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
666
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
667
 
668
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
669
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
670
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
671
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
672
 
673
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
674
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
675
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
676
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
677
 
678
  annotate_current_page_bottom.submit(update_other_annotator_number_from_current, inputs=[annotate_current_page_bottom], outputs=[annotate_current_page]).\
679
  success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
680
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
681
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
682
 
683
  # Apply page redactions
684
+ annotation_button_apply.click(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df], scroll_to_output=True)
685
 
686
  # Save current page redactions
687
  update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
688
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
689
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
690
 
691
  # Review table controls
692
  recogniser_entity_dropdown.select(update_entities_df_recogniser_entities, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, text_entity_dropdown, page_entity_dropdown])
 
695
 
696
  # Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
697
  recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
698
+ success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_df, selected_entity_id, selected_entity_colour], outputs=[review_file_df, selected_entity_id, selected_entity_colour]).\
699
+ success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
700
+ success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
701
 
702
  reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
703
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
704
 
705
  # Exclude current selection from annotator and outputs
706
  # Exclude only selected row
707
+ exclude_selected_row_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_df, selected_entity_dataframe_row, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
708
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
709
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df]).\
710
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
711
 
712
  # Exclude all items with same text as selected row
713
  exclude_text_with_same_as_selected_row_btn.click(get_all_rows_with_same_text, inputs=[recogniser_entity_dataframe_base, selected_entity_dataframe_row_text], outputs=[recogniser_entity_dataframe_same_text]).\
714
+ success(exclude_selected_items_from_redaction, inputs=[review_file_df, recogniser_entity_dataframe_same_text, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
715
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
716
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df]).\
717
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
718
 
719
  # Exclude everything visible in table
720
+ exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_df, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
721
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
722
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df]).\
723
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
724
 
725
+ undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base]).\
726
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
727
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
 
 
728
 
729
  # Review OCR text button
730
+ all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_ocr_dataframe_row]).\
731
+ success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_ocr_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
732
+ success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
733
+
734
  reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
735
 
736
  # Convert review file to xfdf Adobe format
737
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
738
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
739
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
740
 
741
  # Convert xfdf Adobe file back to review_file.csv
742
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
743
+ success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
744
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
745
 
746
  ###
 
753
  success(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number], api_name="redact_data").\
754
  success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
755
 
756
+ # Currently only supports redacting one data file at a time; the following code block is not used
757
  # If the output file count text box changes, keep going with redacting each data file until done
758
  # text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list_state, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox, actual_time_taken_number], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state, actual_time_taken_number]).\
759
  # success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
 
761
  ###
762
  # IDENTIFY DUPLICATE PAGES
763
  ###
764
+ find_duplicate_pages_btn.click(
765
+ fn=run_duplicate_analysis,
766
+ inputs=[
767
+ in_duplicate_pages,
768
+ duplicate_threshold_input,
769
+ min_word_count_input,
770
+ min_consecutive_pages_input,
771
+ greedy_match_input
772
+ ],
773
+ outputs=[
774
+ results_df_preview,
775
+ duplicate_files_out,
776
+ full_duplicate_data_by_file
777
+ ]
778
+ )
779
+
780
+ # full_duplicated_data_df,
781
+ results_df_preview.select(
782
+ fn=handle_selection_and_preview,
783
+ inputs=[results_df_preview, full_duplicate_data_by_file],
784
+ outputs=[selected_duplicate_data_row_index, page1_text_preview, page2_text_preview]
785
+ )
786
+
787
+ # When the user clicks the "Exclude" button
788
+ exclude_match_btn.click(
789
+ fn=exclude_match,
790
+ inputs=[results_df_preview, selected_duplicate_data_row_index],
791
+ outputs=[results_df_preview, duplicate_files_out, page1_text_preview, page2_text_preview]
792
+ )
793
+
794
+ apply_match_btn.click(
795
+ fn=apply_whole_page_redactions_from_list,
796
+ inputs=[in_fully_redacted_list_state, doc_file_name_with_extension_textbox, review_file_df, duplicate_files_out, pdf_doc_state, page_sizes, all_image_annotations_state],
797
+ outputs=[review_file_df, all_image_annotations_state]).\
798
+ success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
799
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
800
 
801
  ###
802
  # SETTINGS PAGE INPUT / OUTPUT
 
811
  in_deny_list_state.input(update_dataframe, inputs=[in_deny_list_state], outputs=[in_deny_list_state])
812
  in_fully_redacted_list_state.input(update_dataframe, inputs=[in_fully_redacted_list_state], outputs=[in_fully_redacted_list_state])
813
 
814
+ apply_fully_redacted_list_btn.click(
815
+ fn=apply_whole_page_redactions_from_list,
816
+ inputs=[in_fully_redacted_list_state, doc_file_name_with_extension_textbox, review_file_df, duplicate_files_out, pdf_doc_state, page_sizes, all_image_annotations_state],
817
+ outputs=[review_file_df, all_image_annotations_state]).\
818
+ success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
819
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
820
+
821
  # Merge multiple review csv files together
822
  merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
823
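The chains above all follow the same Gradio pattern: a component event such as .click() or .select() runs a first handler, and each chained .success() step only fires if the previous step completed without an error, passing state between handlers through shared components. A minimal, self-contained sketch of that pattern (illustrative only; the component and function names here are made up and are not part of this pull request):

import gradio as gr

def decrease_page(current_page):
    # Mirrors the app's page-navigation handlers: never go below page 1
    return max(1, current_page - 1)

def refresh_view(current_page):
    return f"Showing page {current_page}"

with gr.Blocks() as demo:
    page_number = gr.Number(value=1, precision=0)
    status = gr.Textbox()
    previous_page_btn = gr.Button("Previous page")

    # Same shape as the wiring above: navigate first, then refresh the view,
    # but only if the navigation step succeeded.
    previous_page_btn.click(fn=decrease_page, inputs=[page_number], outputs=[page_number]).\
        success(fn=refresh_view, inputs=[page_number], outputs=[status])

demo.launch()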
 
cdk/__init__.py ADDED
File without changes
cdk/app.py ADDED
@@ -0,0 +1,81 @@
1
+ import os
2
+ from aws_cdk import (App, Environment)
3
+
4
+ # Helper modules for pre-deployment checks, configuration and stack definitions
5
+ from check_resources import check_and_set_context, CONTEXT_FILE
6
+ from cdk_config import AWS_ACCOUNT_ID, AWS_REGION, RUN_USEAST_STACK, USE_CLOUDFRONT
7
+ from cdk_stack import CdkStack, CdkStackCloudfront#, CdkStackMain
8
+ from cdk_functions import load_context_from_file, create_basic_config_env
9
+
10
+ # Initialize the CDK app
11
+ app = App()
12
+
13
+ # --- ENHANCED CONTEXT GENERATION AND LOADING ---
14
+ # 1. Always ensure the old context file is removed before generation
15
+ if os.path.exists(CONTEXT_FILE):
16
+ try:
17
+ os.remove(CONTEXT_FILE)
18
+ print(f"Removed stale context file: {CONTEXT_FILE}")
19
+ except OSError as e:
20
+ print(f"Warning: Could not remove old context file {CONTEXT_FILE}: {e}")
21
+ # Proceed anyway, check_and_set_context might handle overwriting
22
+
23
+ # 2. Always run the pre-check script to generate fresh context
24
+ print("Running pre-check script to generate application context...")
25
+ try:
26
+ check_and_set_context()
27
+ if not os.path.exists(CONTEXT_FILE):
28
+ raise RuntimeError(f"check_and_set_context() finished, but {CONTEXT_FILE} was not created.")
29
+ print(f"Context generated successfully at {CONTEXT_FILE}.")
30
+ except Exception as e:
31
+ raise RuntimeError(f"Failed to generate context via check_and_set_context(): {e}")
32
+
33
+ if os.path.exists(CONTEXT_FILE):
34
+ load_context_from_file(app, CONTEXT_FILE)
35
+ else:
36
+ raise RuntimeError(f"Could not find {CONTEXT_FILE}.")
37
+
38
+ # Create a basic config.env file that the user can use to run the app later. The argument is the folder it is saved into.
39
+ create_basic_config_env("config")
40
+
41
+ # Define the environment for the regional stack (where ALB resides)
42
+ aws_env_regional = Environment(account=AWS_ACCOUNT_ID, region=AWS_REGION)
43
+
44
+ # Create the regional stack (ALB, SGs, etc.)
45
+ # regional_stack = CdkStack(app,
46
+ # "RedactionStackSubnets",
47
+ # env=aws_env_regional,
48
+ # cross_region_references=True)
49
+
50
+ # regional_stack_main = CdkStackMain(app,
51
+ # "RedactionStackMain",
52
+ # env=aws_env_regional,
53
+ # private_subnets=regional_stack.params["private_subnets"],
54
+ # private_route_tables=regional_stack.params["private_route_tables"],
55
+ # public_subnets=regional_stack.params["public_subnets"],
56
+ # public_route_tables=regional_stack.params["public_route_tables"],
57
+ # cross_region_references=True)
58
+
59
+ regional_stack = CdkStack(app,
60
+ "RedactionStack",
61
+ env=aws_env_regional,
62
+ cross_region_references=True)
63
+
64
+ if USE_CLOUDFRONT == 'True' and RUN_USEAST_STACK == 'True':
65
+ # Define the environment for the CloudFront stack (always us-east-1 for CF-level resources like WAFv2 WebACLs for CF)
66
+ aws_env_us_east_1 = Environment(account=AWS_ACCOUNT_ID, region="us-east-1")
67
+
68
+ # Create the CloudFront stack, passing the outputs from the regional stack
69
+ cloudfront_stack = CdkStackCloudfront(
70
+ app,
71
+ "RedactionStackCloudfront",
72
+ env=aws_env_us_east_1,
73
+ alb_arn=regional_stack.params["alb_arn_output"],
74
+ alb_sec_group_id=regional_stack.params["alb_security_group_id"],
75
+ alb_dns_name=regional_stack.params["alb_dns_name"],
76
+ cross_region_references=True
77
+ )
78
+
79
+
80
+ # Synthesize the CloudFormation template
81
+ app.synth(validate_on_synthesis=True)
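The stack names used above come straight from this file, so once context generation succeeds the app can be synthesised and deployed with the usual CDK workflow. A minimal sketch of the context round-trip that app.py relies on (the file name cdk.context.json is from the source; the keys and values below are placeholders):

import json
from aws_cdk import App

# Hypothetical pre-generated context, in the shape check_and_set_context() writes
context_data = {"exists:example-log-bucket": True, "vpc_id": "vpc-0123456789abcdef0"}
with open("cdk.context.json", "w") as f:
    json.dump(context_data, f)

app = App()
with open("cdk.context.json") as f:
    for key, value in json.load(f).items():
        app.node.set_context(key, value)  # same call load_context_from_file() makes

assert app.node.try_get_context("vpc_id") == "vpc-0123456789abcdef0"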
cdk/cdk_config.py ADDED
@@ -0,0 +1,225 @@
1
+ import os
2
+ import tempfile
3
+ from dotenv import load_dotenv
4
+
5
+ # Set or retrieve configuration variables for CDK redaction deployment
6
+
7
+ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False):
8
+ '''
9
+ Get an environmental variable, and set it to a default value if it doesn't exist
10
+ '''
11
+ # Get the environment variable if it exists
12
+ value = os.environ.get(var_name)
13
+
14
+ # If it doesn't exist, set the environment variable to the default value
15
+ if value is None:
16
+ os.environ[var_name] = default_value
17
+ value = default_value
18
+
19
+ if print_val == True:
20
+ print(f'The value of {var_name} is {value}')
21
+
22
+ return value
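+ # Illustrative behaviour (hypothetical variable name): if MY_SETTING is not set in the
+ # environment, get_or_create_env_var('MY_SETTING', 'default') returns 'default' and also
+ # writes it to os.environ, so later lookups of MY_SETTING see the same value.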
23
+
24
+ def ensure_folder_exists(output_folder:str):
25
+ """Checks if the specified folder exists, creates it if not."""
26
+
27
+ if not os.path.exists(output_folder):
28
+ # Create the folder if it doesn't exist
29
+ os.makedirs(output_folder, exist_ok=True)
30
+ print(f"Created the {output_folder} folder.")
31
+ else:
32
+ print(f"The {output_folder} folder already exists.")
33
+
34
+ def add_folder_to_path(folder_path: str):
35
+ '''
36
+ Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
37
+ '''
38
+
39
+ if os.path.exists(folder_path) and os.path.isdir(folder_path):
40
+ print(folder_path, "folder exists.")
41
+
42
+ # Resolve relative path to absolute path
43
+ absolute_path = os.path.abspath(folder_path)
44
+
45
+ current_path = os.environ['PATH']
46
+ if absolute_path not in current_path.split(os.pathsep):
47
+ full_path_extension = absolute_path + os.pathsep + current_path
48
+ os.environ['PATH'] = full_path_extension
49
+ #print(f"Updated PATH with: ", full_path_extension)
50
+ else:
51
+ print(f"Directory {folder_path} already exists in PATH.")
52
+ else:
53
+ print(f"Folder not found at {folder_path} - not added to PATH")
54
+
55
+ ###
56
+ # LOAD CONFIG FROM ENV FILE
57
+ ###
58
+ CONFIG_FOLDER = get_or_create_env_var('CONFIG_FOLDER', "config/")
59
+
60
+ ensure_folder_exists(CONFIG_FOLDER)
61
+
62
+ # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/cdk_config.env'
63
+ CDK_CONFIG_PATH = get_or_create_env_var('CDK_CONFIG_PATH', 'config/cdk_config.env') # e.g. config/cdk_config.env
64
+
65
+ if CDK_CONFIG_PATH:
66
+ if os.path.exists(CDK_CONFIG_PATH):
67
+ print(f"Loading CDK variables from config file {CDK_CONFIG_PATH}")
68
+ load_dotenv(CDK_CONFIG_PATH)
69
+ else: print("CDK config file not found at location:", CDK_CONFIG_PATH)
70
+
71
+ ###
72
+ # AWS OPTIONS
73
+ ###
74
+ AWS_REGION = get_or_create_env_var('AWS_REGION', '')
75
+ AWS_ACCOUNT_ID = get_or_create_env_var('AWS_ACCOUNT_ID', '')
76
+
77
+ ###
78
+ # CDK OPTIONS
79
+ ###
80
+ CDK_PREFIX = get_or_create_env_var('CDK_PREFIX', '')
81
+ CONTEXT_FILE = get_or_create_env_var('CONTEXT_FILE', 'cdk.context.json') # Define the CDK output context file name
82
+ CDK_FOLDER = get_or_create_env_var('CDK_FOLDER', '') # FULL_PATH_TO_CDK_FOLDER_HERE (with forward slash)
83
+ RUN_USEAST_STACK = get_or_create_env_var('RUN_USEAST_STACK', 'False')
84
+
85
+ ### VPC
86
+ VPC_NAME = get_or_create_env_var('VPC_NAME', '')
87
+ EXISTING_IGW_ID = get_or_create_env_var('EXISTING_IGW_ID', '')
88
+ SINGLE_NAT_GATEWAY_ID = get_or_create_env_var('SINGLE_NAT_GATEWAY_ID', '')
89
+
90
+ ### SUBNETS / ROUTE TABLES / NAT GATEWAY
91
+ PUBLIC_SUBNETS_TO_USE = get_or_create_env_var('PUBLIC_SUBNETS_TO_USE', '') # e.g. ['PublicSubnet1', 'PublicSubnet2']
92
+ PUBLIC_SUBNET_CIDR_BLOCKS = get_or_create_env_var('PUBLIC_SUBNET_CIDR_BLOCKS', '') # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
93
+ PUBLIC_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var('PUBLIC_SUBNET_AVAILABILITY_ZONES', '') # e.g. ["eu-west-2a", "eu-west-2b"]
94
+
95
+ PRIVATE_SUBNETS_TO_USE = get_or_create_env_var('PRIVATE_SUBNETS_TO_USE', '') # e.g. ['PrivateSubnet1', 'PrivateSubnet2']
96
+ PRIVATE_SUBNET_CIDR_BLOCKS = get_or_create_env_var('PRIVATE_SUBNET_CIDR_BLOCKS', '') # e.g. ["10.0.1.0/24", "10.0.2.0/24"]
97
+ PRIVATE_SUBNET_AVAILABILITY_ZONES = get_or_create_env_var('PRIVATE_SUBNET_AVAILABILITY_ZONES', '') # e.g. ["eu-west-2a", "eu-west-2b"]
98
+
99
+ ROUTE_TABLE_BASE_NAME = get_or_create_env_var('ROUTE_TABLE_BASE_NAME', f'{CDK_PREFIX}PrivateRouteTable')
100
+ NAT_GATEWAY_EIP_NAME = get_or_create_env_var('NAT_GATEWAY_EIP_NAME', f"{CDK_PREFIX}NatGatewayEip")
101
+ NAT_GATEWAY_NAME = get_or_create_env_var('NAT_GATEWAY_NAME', f"{CDK_PREFIX}NatGateway")
102
+
103
+ # IAM roles
104
+ AWS_MANAGED_TASK_ROLES_LIST = get_or_create_env_var('AWS_MANAGED_TASK_ROLES_LIST', '["AmazonCognitoReadOnly", "service-role/AmazonECSTaskExecutionRolePolicy", "AmazonS3FullAccess", "AmazonTextractFullAccess", "ComprehendReadOnly", "AmazonDynamoDBFullAccess", "service-role/AWSAppSyncPushToCloudWatchLogs"]')
105
+ POLICY_FILE_LOCATIONS = get_or_create_env_var('POLICY_FILE_LOCATIONS', '') # e.g. '["config/sts_permissions.json"]'
106
+ POLICY_FILE_ARNS = get_or_create_env_var('POLICY_FILE_ARNS', '')
107
+
108
+ # GITHUB REPO
109
+ GITHUB_REPO_USERNAME = get_or_create_env_var('GITHUB_REPO_USERNAME', 'seanpedrick-case')
110
+ GITHUB_REPO_NAME = get_or_create_env_var('GITHUB_REPO_NAME', 'doc_redaction')
111
+ GITHUB_REPO_BRANCH = get_or_create_env_var('GITHUB_REPO_BRANCH', 'main')
112
+
113
+ ### CODEBUILD
114
+ CODEBUILD_ROLE_NAME = get_or_create_env_var('CODEBUILD_ROLE_NAME', f"{CDK_PREFIX}CodeBuildRole")
115
+ CODEBUILD_PROJECT_NAME = get_or_create_env_var('CODEBUILD_PROJECT_NAME', f"{CDK_PREFIX}CodeBuildProject")
116
+
117
+ ### ECR
118
+ ECR_REPO_NAME = get_or_create_env_var('ECR_REPO_NAME', 'doc-redaction') # Beware - cannot have underscores and must be lower case
119
+ ECR_CDK_REPO_NAME = get_or_create_env_var('ECR_CDK_REPO_NAME', f"{CDK_PREFIX}{ECR_REPO_NAME}".lower())
120
+
121
+ ### S3
122
+ S3_LOG_CONFIG_BUCKET_NAME = get_or_create_env_var('S3_LOG_CONFIG_BUCKET_NAME', f"{CDK_PREFIX}s3-logs".lower()) # S3 bucket names need to be lower case
123
+ S3_OUTPUT_BUCKET_NAME = get_or_create_env_var('S3_OUTPUT_BUCKET_NAME', f"{CDK_PREFIX}s3-output".lower())
124
+
125
+ ### ECS
126
+ FARGATE_TASK_DEFINITION_NAME = get_or_create_env_var('FARGATE_TASK_DEFINITION_NAME', f"{CDK_PREFIX}FargateTaskDefinition")
127
+ TASK_DEFINITION_FILE_LOCATION = get_or_create_env_var('TASK_DEFINITION_FILE_LOCATION', CDK_FOLDER + CONFIG_FOLDER + "task_definition.json")
128
+
129
+ CLUSTER_NAME = get_or_create_env_var('CLUSTER_NAME', f"{CDK_PREFIX}Cluster")
130
+ ECS_SERVICE_NAME = get_or_create_env_var('ECS_SERVICE_NAME', f"{CDK_PREFIX}ECSService")
131
+ ECS_TASK_ROLE_NAME = get_or_create_env_var('ECS_TASK_ROLE_NAME', f"{CDK_PREFIX}TaskRole")
132
+ ECS_TASK_EXECUTION_ROLE_NAME = get_or_create_env_var('ECS_TASK_EXECUTION_ROLE_NAME', f"{CDK_PREFIX}ExecutionRole")
133
+ ECS_SECURITY_GROUP_NAME = get_or_create_env_var('ECS_SECURITY_GROUP_NAME', f"{CDK_PREFIX}SecurityGroupECS")
134
+ ECS_LOG_GROUP_NAME = get_or_create_env_var('ECS_LOG_GROUP_NAME', f"/ecs/{ECS_SERVICE_NAME}-logs".lower())
135
+
136
+ ECS_TASK_CPU_SIZE = get_or_create_env_var('ECS_TASK_CPU_SIZE', '1024')
137
+ ECS_TASK_MEMORY_SIZE = get_or_create_env_var('ECS_TASK_MEMORY_SIZE', '4096')
138
+ ECS_USE_FARGATE_SPOT = get_or_create_env_var('USE_FARGATE_SPOT', 'False')
139
+ ECS_READ_ONLY_FILE_SYSTEM = get_or_create_env_var('ECS_READ_ONLY_FILE_SYSTEM', 'True')
140
+
141
+ ### Cognito
142
+ COGNITO_USER_POOL_NAME = get_or_create_env_var('COGNITO_USER_POOL_NAME', f"{CDK_PREFIX}UserPool")
143
+ COGNITO_USER_POOL_CLIENT_NAME = get_or_create_env_var('COGNITO_USER_POOL_CLIENT_NAME', f"{CDK_PREFIX}UserPoolClient")
144
+ COGNITO_USER_POOL_CLIENT_SECRET_NAME = get_or_create_env_var('COGNITO_USER_POOL_CLIENT_SECRET_NAME', f"{CDK_PREFIX}ParamCognitoSecret")
145
+ COGNITO_USER_POOL_DOMAIN_PREFIX = get_or_create_env_var('COGNITO_USER_POOL_DOMAIN_PREFIX', "redaction-app-domain") # Should change this to something unique or you'll probably hit an error
146
+
147
+ # Application load balancer
148
+ ALB_NAME = get_or_create_env_var('ALB_NAME', f"{CDK_PREFIX}Alb"[-32:]) # Application load balancer name can be max 32 characters, so taking the last 32 characters of the suggested name
149
+ ALB_NAME_SECURITY_GROUP_NAME = get_or_create_env_var('ALB_SECURITY_GROUP_NAME', f"{CDK_PREFIX}SecurityGroupALB")
150
+ ALB_TARGET_GROUP_NAME = get_or_create_env_var('ALB_TARGET_GROUP_NAME', f"{CDK_PREFIX}-tg"[-32:]) # Max 32 characters
151
+ EXISTING_LOAD_BALANCER_ARN = get_or_create_env_var('EXISTING_LOAD_BALANCER_ARN', '')
152
+ EXISTING_LOAD_BALANCER_DNS = get_or_create_env_var('EXISTING_LOAD_BALANCER_DNS', 'placeholder_load_balancer_dns.net')
153
+
154
+ ## CLOUDFRONT
155
+ USE_CLOUDFRONT = get_or_create_env_var('USE_CLOUDFRONT', 'True')
156
+ CLOUDFRONT_PREFIX_LIST_ID = get_or_create_env_var('CLOUDFRONT_PREFIX_LIST_ID', 'pl-93a247fa')
157
+ CLOUDFRONT_GEO_RESTRICTION = get_or_create_env_var('CLOUDFRONT_GEO_RESTRICTION', '') # A country that Cloudfront restricts access to. See here: https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/georestrictions.html
158
+ CLOUDFRONT_DISTRIBUTION_NAME = get_or_create_env_var('CLOUDFRONT_DISTRIBUTION_NAME', f"{CDK_PREFIX}CfDist")
159
+ CLOUDFRONT_DOMAIN = get_or_create_env_var('CLOUDFRONT_DOMAIN', "cloudfront_placeholder.net")
160
+
161
+
162
+ # Certificate for Application load balancer (optional, for HTTPS and logins through the ALB)
163
+ ACM_CERTIFICATE_ARN = get_or_create_env_var('ACM_CERTIFICATE_ARN', '')
164
+ SSL_CERTIFICATE_DOMAIN = get_or_create_env_var('SSL_CERTIFICATE_DOMAIN', '') # e.g. example.com or www.example.com
165
+
166
+ # This should be the CloudFront domain, the domain linked to your ACM certificate, or the DNS of your application load balancer in console afterwards
167
+ if USE_CLOUDFRONT == "True":
168
+ COGNITO_REDIRECTION_URL = get_or_create_env_var('COGNITO_REDIRECTION_URL', "https://" + CLOUDFRONT_DOMAIN)
169
+ elif SSL_CERTIFICATE_DOMAIN:
170
+ COGNITO_REDIRECTION_URL = get_or_create_env_var('COGNITO_REDIRECTION_URL', "https://" + SSL_CERTIFICATE_DOMAIN)
171
+ else:
172
+ COGNITO_REDIRECTION_URL = get_or_create_env_var('COGNITO_REDIRECTION_URL', "https://" + EXISTING_LOAD_BALANCER_DNS)
173
+
174
+ # Custom headers e.g. if routing traffic through Cloudfront
175
+ CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '') # Retrieving or setting CUSTOM_HEADER
176
+ CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '') # Retrieving or setting CUSTOM_HEADER_VALUE
177
+
178
+ # Firewall on top of load balancer
179
+ LOAD_BALANCER_WEB_ACL_NAME = get_or_create_env_var('LOAD_BALANCER_WEB_ACL_NAME', f"{CDK_PREFIX}alb-web-acl")
180
+
181
+ # Firewall on top of CloudFront
182
+ WEB_ACL_NAME = get_or_create_env_var('WEB_ACL_NAME', f"{CDK_PREFIX}cloudfront-web-acl")
183
+
184
+ ###
185
+ # File I/O options
186
+ ###
187
+
188
+ OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
189
+ INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
190
+
191
+ # Allow for files to be saved in a temporary folder for increased security in some instances
192
+ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
193
+ # Create a temporary directory
194
+ temp_dir = tempfile.mkdtemp() # mkdtemp keeps the folder after this block; TemporaryDirectory() would delete it when the 'with' context exits
195
+ print(f'Temporary directory created at: {temp_dir}')
196
+
197
+ if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
198
+ if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
199
+
200
+ ###
201
+ # LOGGING OPTIONS
202
+ ###
203
+
204
+ SAVE_LOGS_TO_CSV = get_or_create_env_var('SAVE_LOGS_TO_CSV', 'True')
205
+
206
+ ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
207
+ SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'True')
208
+ ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-access-log".lower())
209
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('FEEDBACK_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-feedback".lower())
210
+ USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('USAGE_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-usage".lower())
211
+
212
+ ###
213
+ # REDACTION OPTIONS
214
+ ###
215
+
216
+ # Get some environment variables and Launch the Gradio app
217
+ COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
218
+
219
+ GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
220
+
221
+ ###
222
+ # WHOLE DOCUMENT API OPTIONS
223
+ ###
224
+
225
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var('DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS', '7') # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
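+ # For reference, a config/cdk_config.env file read via CDK_CONFIG_PATH above could look
+ # like the following (illustrative placeholder values only):
+ # AWS_REGION=eu-west-2
+ # AWS_ACCOUNT_ID=123456789012
+ # CDK_PREFIX=redact
+ # VPC_NAME=my-existing-vpc
+ # USE_CLOUDFRONT=True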
cdk/cdk_functions.py ADDED
@@ -0,0 +1,1293 @@
1
+ import boto3
2
+ from botocore.exceptions import ClientError
3
+ import json
4
+ import os
5
+ import pandas as pd
6
+ import ipaddress
7
+ from constructs import Construct
8
+ from dotenv import set_key
9
+ from typing import List, Tuple, Optional, Dict, Any
10
+ from aws_cdk import (
11
+ App,
12
+ CfnTag,
13
+ aws_ec2 as ec2,
14
+ aws_wafv2 as wafv2,
15
+ aws_elasticloadbalancingv2 as elb,
16
+ aws_elasticloadbalancingv2_actions as elb_act,
17
+ aws_certificatemanager as acm, # You might need this if you were looking up a cert, but not strictly for ARN
18
+ aws_cognito as cognito,
19
+ aws_iam as iam,
20
+ CfnOutput,
21
+ Tags
22
+ )
23
+
24
+
25
+
26
+ from cdk_config import PUBLIC_SUBNETS_TO_USE, PRIVATE_SUBNETS_TO_USE, PUBLIC_SUBNET_CIDR_BLOCKS, PRIVATE_SUBNET_CIDR_BLOCKS, PUBLIC_SUBNET_AVAILABILITY_ZONES, PRIVATE_SUBNET_AVAILABILITY_ZONES, POLICY_FILE_LOCATIONS, NAT_GATEWAY_EIP_NAME, S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, AWS_REGION
27
+
28
+ # --- Function to load context from file ---
29
+ def load_context_from_file(app: App, file_path: str):
30
+ if os.path.exists(file_path):
31
+ with open(file_path, 'r') as f:
32
+ context_data = json.load(f)
33
+ for key, value in context_data.items():
34
+ app.node.set_context(key, value)
35
+ print(f"Loaded context from {file_path}")
36
+ else:
37
+ print(f"Context file not found: {file_path}")
38
+
39
+ # --- Helper to parse environment variables into lists ---
40
+ def _get_env_list(env_list_str: str) -> List[str]:
42
+ """Parses a bracketed, comma-separated string such as '["a", "b"]' into a list of strings."""
43
+ value = env_list_str[1:-1].strip().replace('\"', '').replace("\'","")
43
+ if not value:
44
+ return []
45
+ # Split by comma and filter out any empty strings that might result from extra commas
46
+ return [s.strip() for s in value.split(',') if s.strip()]
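+ # Illustrative behaviour (hypothetical input, in the bracketed list format used by
+ # cdk_config.py): _get_env_list('["10.0.1.0/24", "10.0.2.0/24"]') returns
+ # ['10.0.1.0/24', '10.0.2.0/24'].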
47
+
48
+ # 1. Try to load CIDR/AZs from environment variables
49
+ if PUBLIC_SUBNETS_TO_USE: PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
50
+ if PRIVATE_SUBNETS_TO_USE: PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
51
+
52
+ if PUBLIC_SUBNET_CIDR_BLOCKS: PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
53
+ if PUBLIC_SUBNET_AVAILABILITY_ZONES: PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
54
+ if PRIVATE_SUBNET_CIDR_BLOCKS: PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
55
+ if PRIVATE_SUBNET_AVAILABILITY_ZONES: PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
56
+
57
+ if POLICY_FILE_LOCATIONS: POLICY_FILE_LOCATIONS = _get_env_list(POLICY_FILE_LOCATIONS)
58
+
59
+ def check_for_existing_role(role_name:str):
60
+ try:
61
+ iam = boto3.client('iam')
62
+ #iam.get_role(RoleName=role_name)
63
+
64
+ response = iam.get_role(RoleName=role_name)
65
+ role = response['Role']['Arn']
66
+
67
+ print("Response Role:", role)
68
+
69
+ return True, role, ""
70
+ except iam.exceptions.NoSuchEntityException:
71
+ return False, "", ""
72
+ except Exception as e:
73
+ raise Exception("Getting information on IAM role failed due to:", e)
74
+
75
+ import json
76
+ from typing import List, Dict, Any, Union, Optional
77
+ from aws_cdk import (
78
+ aws_iam as iam,
79
+ )
80
+ from constructs import Construct
81
+
82
+ # Assume POLICY_FILE_LOCATIONS is defined globally or passed as a default
83
+ # For example:
84
+ # POLICY_FILE_LOCATIONS = ["./policies/my_read_policy.json", "./policies/my_write_policy.json"]
85
+
86
+
87
+ def add_statement_to_policy(role: iam.IRole, policy_document: Dict[str, Any]):
88
+ """
89
+ Adds individual policy statements from a parsed policy document to a CDK Role.
90
+
91
+ Args:
92
+ role: The CDK Role construct to attach policies to.
93
+ policy_document: A Python dictionary representing an IAM policy document.
94
+ """
95
+ # Ensure the loaded JSON is a valid policy document structure
96
+ if 'Statement' not in policy_document or not isinstance(policy_document['Statement'], list):
97
+ print(f"Warning: Policy document does not contain a 'Statement' list. Skipping.")
98
+ return # Do not return role, just log and exit
99
+
100
+ for statement_dict in policy_document['Statement']:
101
+ try:
102
+ # Create a CDK PolicyStatement from the dictionary
103
+ cdk_policy_statement = iam.PolicyStatement.from_json(statement_dict)
104
+
105
+ # Add the policy statement to the role
106
+ role.add_to_policy(cdk_policy_statement)
107
+ print(f" - Added statement: {statement_dict.get('Sid', 'No Sid')}")
108
+ except Exception as e:
109
+ print(f"Warning: Could not process policy statement: {statement_dict}. Error: {e}")
110
+
111
+ def add_custom_policies(
112
+ scope: Construct, # Not strictly used here, but good practice if you expand to ManagedPolicies
113
+ role: iam.IRole,
114
+ policy_file_locations: Optional[List[str]] = None,
115
+ custom_policy_text: Optional[str] = None
116
+ ) -> iam.IRole:
117
+ """
118
+ Loads custom policies from JSON files or a string and attaches them to a CDK Role.
119
+
120
+ Args:
121
+ scope: The scope in which to define constructs (if needed, e.g., for iam.ManagedPolicy).
122
+ role: The CDK Role construct to attach policies to.
123
+ policy_file_locations: List of file paths to JSON policy documents.
124
+ custom_policy_text: A JSON string representing a policy document.
125
+
126
+ Returns:
127
+ The modified CDK Role construct.
128
+ """
129
+ if policy_file_locations is None:
130
+ policy_file_locations = []
131
+
132
+ current_source = "unknown source" # For error messages
133
+
134
+ try:
135
+ if policy_file_locations:
136
+ print(f"Attempting to add policies from files to role {role.node.id}...")
137
+ for path in policy_file_locations:
138
+ current_source = f"file: {path}"
139
+ try:
140
+ with open(path, 'r') as f:
141
+ policy_document = json.load(f)
142
+ print(f"Processing policy from {current_source}...")
143
+ add_statement_to_policy(role, policy_document)
144
+ except FileNotFoundError:
145
+ print(f"Warning: Policy file not found at {path}. Skipping.")
146
+ except json.JSONDecodeError as e:
147
+ print(f"Warning: Invalid JSON in policy file {path}: {e}. Skipping.")
148
+ except Exception as e:
149
+ print(f"An unexpected error occurred processing policy from {path}: {e}. Skipping.")
150
+
151
+ if custom_policy_text:
152
+ current_source = "custom policy text string"
153
+ print(f"Attempting to add policy from custom text to role {role.node.id}...")
154
+ try:
155
+ # *** FIX: Parse the JSON string into a Python dictionary ***
156
+ policy_document = json.loads(custom_policy_text)
157
+ print(f"Processing policy from {current_source}...")
158
+ add_statement_to_policy(role, policy_document)
159
+ except json.JSONDecodeError as e:
160
+ print(f"Warning: Invalid JSON in custom_policy_text: {e}. Skipping.")
161
+ except Exception as e:
162
+ print(f"An unexpected error occurred processing policy from custom_policy_text: {e}. Skipping.")
163
+
164
+ # You might want a final success message, but individual processing messages are also good.
165
+ print(f"Finished processing custom policies for role {role.node.id}.")
166
+
167
+ except Exception as e:
168
+ print(f"An unhandled error occurred during policy addition for {current_source}: {e}")
169
+
170
+ return role
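+ # Illustrative call (the role variable is a placeholder; the policy path matches the
+ # example given for POLICY_FILE_LOCATIONS in cdk_config.py):
+ # task_role = add_custom_policies(self, task_role, policy_file_locations=["config/sts_permissions.json"])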
171
+
172
+ # Import the S3 Bucket class if you intend to return a CDK object later
173
+ # from aws_cdk import aws_s3 as s3
174
+
175
+ def check_s3_bucket_exists(bucket_name: str): # Return type hint depends on what you return
176
+ """
177
+ Checks if an S3 bucket with the given name exists and is accessible.
178
+
179
+ Args:
180
+ bucket_name: The name of the S3 bucket to check.
181
+
182
+ Returns:
183
+ A tuple: (bool indicating existence, optional S3 Bucket object or None)
184
+ Note: Returning a Boto3 S3 Bucket object from here is NOT ideal
185
+ for direct use in CDK. You'll likely only need the boolean result
186
+ or the bucket name for CDK lookups/creations.
187
+ For this example, let's return the boolean and the name.
188
+ """
189
+ s3_client = boto3.client('s3')
190
+ try:
191
+ # Use head_bucket to check for existence and access
192
+ s3_client.head_bucket(Bucket=bucket_name)
193
+ print(f"Bucket '{bucket_name}' exists and is accessible.")
194
+ return True, bucket_name # Return True and the bucket name
195
+
196
+ except ClientError as e:
197
+ # If a ClientError occurs, check the error code.
198
+ # '404' means the bucket does not exist.
199
+ # '403' means the bucket exists but you don't have permission.
200
+ error_code = e.response['Error']['Code']
201
+ if error_code == '404':
202
+ print(f"Bucket '{bucket_name}' does not exist.")
203
+ return False, None
204
+ elif error_code == '403':
205
+ # The bucket exists, but you can't access it.
206
+ # Depending on your requirements, this might be treated as "exists"
207
+ # or "not accessible for our purpose". For checking existence,
208
+ # we'll say it exists here, but note the permission issue.
209
+ # NOTE - when I tested this, it was returning 403 even for buckets that don't exist. So I will return False instead
210
+ print(f"Bucket '{bucket_name}' returned 403, which indicates it may exist but is not accessible due to permissions, or that it doesn't exist. Returning False for existence just in case.")
211
+ return False, bucket_name # May exist but is not accessible with current permissions; treated as not available
212
+ else:
213
+ # For other errors, it's better to raise the exception
214
+ # to indicate something unexpected happened.
215
+ print(f"An unexpected AWS ClientError occurred checking bucket '{bucket_name}': {e}")
216
+ # Decide how to handle other errors - raising might be safer
217
+ raise # Re-raise the original exception
218
+ except Exception as e:
219
+ print(f"An unexpected non-ClientError occurred checking bucket '{bucket_name}': {e}")
220
+ # Decide how to handle other errors
221
+ raise # Re-raise the original exception
222
+
223
+ # Example usage in your check_resources.py:
224
+ # exists, bucket_name_if_exists = check_s3_bucket_exists(log_bucket_name)
225
+ # context_data[f"exists:{log_bucket_name}"] = exists
226
+ # # You don't necessarily need to store the name in context if using from_bucket_name
227
+
228
+ # Delete an S3 bucket
229
+ def delete_s3_bucket(bucket_name:str):
230
+ s3 = boto3.client('s3')
231
+
232
+ try:
233
+ # List and delete all objects
234
+ response = s3.list_object_versions(Bucket=bucket_name)
235
+ versions = response.get('Versions', []) + response.get('DeleteMarkers', [])
236
+ for version in versions:
237
+ s3.delete_object(Bucket=bucket_name, Key=version['Key'], VersionId=version['VersionId'])
238
+
239
+ # Delete the bucket
240
+ s3.delete_bucket(Bucket=bucket_name)
241
+ return {'Status': 'SUCCESS'}
242
+ except Exception as e:
243
+ return {'Status': 'FAILED', 'Reason': str(e)}
244
+
245
+ # Function to get subnet ID from subnet name
246
+ def get_subnet_id(vpc, ec2_client, subnet_name: str):
247
+ response = ec2_client.describe_subnets(Filters=[{'Name': 'vpc-id', 'Values': [vpc.vpc_id]}])
248
+
249
+ for subnet in response['Subnets']:
250
+ if subnet.get('Tags') and any(tag['Key'] == 'Name' and tag['Value'] == subnet_name for tag in subnet['Tags']):
251
+ return subnet['SubnetId']
252
+
253
+ return None
254
+
255
+ def check_ecr_repo_exists(repo_name: str) -> tuple[bool, dict]:
256
+ """
257
+ Checks if an ECR repository with the given name exists.
258
+
259
+ Args:
260
+ repo_name: The name of the ECR repository to check.
261
+
262
+ Returns:
263
+ True if the repository exists, False otherwise.
264
+ """
265
+ ecr_client = boto3.client('ecr')
266
+ try:
267
+ print("ecr repo_name to check:", repo_name)
268
+ response = ecr_client.describe_repositories(repositoryNames=[repo_name])
269
+ # If describe_repositories succeeds and returns a list of repositories,
270
+ # and the list is not empty, the repository exists.
271
+ return len(response['repositories']) > 0, response['repositories'][0]
272
+ except ClientError as e:
273
+ # Check for the specific error code indicating the repository doesn't exist
274
+ if e.response['Error']['Code'] == 'RepositoryNotFoundException':
275
+ return False, {}
276
+ else:
277
+ # Re-raise other exceptions to handle unexpected errors
278
+ raise
279
+ except Exception as e:
280
+ print(f"An unexpected error occurred: {e}")
281
+ return False, {}
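+ # Illustrative usage (repository name is a placeholder matching the default ECR_REPO_NAME):
+ # exists, repo_details = check_ecr_repo_exists("doc-redaction")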
282
+
283
+ def check_codebuild_project_exists(project_name: str): # Adjust return type hint as needed
284
+ """
285
+ Checks if a CodeBuild project with the given name exists.
286
+
287
+ Args:
288
+ project_name: The name of the CodeBuild project to check.
289
+
290
+ Returns:
291
+ A tuple:
292
+ - The first element is True if the project exists, False otherwise.
293
+ - The second element is the project object (dictionary) if found,
294
+ None otherwise.
295
+ """
296
+ codebuild_client = boto3.client('codebuild')
297
+ try:
298
+ # Use batch_get_projects with a list containing the single project name
299
+ response = codebuild_client.batch_get_projects(names=[project_name])
300
+
301
+ # The response for batch_get_projects includes 'projects' (found)
302
+ # and 'projectsNotFound' (not found).
303
+ if response['projects']:
304
+ # If the project is found in the 'projects' list
305
+ print(f"CodeBuild project '{project_name}' found.")
306
+ return True, response['projects'][0]['arn'] # Return True and the project details dict
307
+ elif response['projectsNotFound'] and project_name in response['projectsNotFound']:
308
+ # If the project name is explicitly in the 'projectsNotFound' list
309
+ print(f"CodeBuild project '{project_name}' not found.")
310
+ return False, None
311
+ else:
312
+ # This case is less expected for a single name lookup,
313
+ # but could happen if there's an internal issue or the response
314
+ # structure is slightly different than expected for an error.
315
+ # It's safer to assume it wasn't found if not in 'projects'.
316
+ print(f"CodeBuild project '{project_name}' not found (not in 'projects' list).")
317
+ return False, None
318
+
319
+ except ClientError as e:
320
+ # Catch specific ClientErrors. batch_get_projects might not throw
321
+ # 'InvalidInputException' for a non-existent project name if the
322
+ # name format is valid. It typically just lists it in projectsNotFound.
323
+ # However, other ClientErrors are possible (e.g., permissions).
324
+ print(f"An AWS ClientError occurred checking CodeBuild project '{project_name}': {e}")
325
+ # Decide how to handle other ClientErrors - raising might be safer
326
+ raise # Re-raise the original exception
327
+ except Exception as e:
328
+ print(f"An unexpected non-ClientError occurred checking CodeBuild project '{project_name}': {e}")
329
+ # Decide how to handle other errors
330
+ raise # Re-raise the original exception
331
+
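+ # Example usage (illustrative; the project name is a placeholder):
+ # project_exists, project_arn = check_codebuild_project_exists("doc-redaction-codebuild")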
332
+ def get_vpc_id_by_name(vpc_name: str) -> Optional[str]:
333
+ """
334
+ Finds a VPC by its 'Name' tag and returns (VPC ID, list of NAT Gateways in that VPC), or None if the VPC is not found.
335
+ """
336
+ ec2_client = boto3.client('ec2')
337
+ try:
338
+ response = ec2_client.describe_vpcs(
339
+ Filters=[
340
+ {'Name': 'tag:Name', 'Values': [vpc_name]}
341
+ ]
342
+ )
343
+ if response and response['Vpcs']:
344
+ vpc_id = response['Vpcs'][0]['VpcId']
345
+ print(f"VPC '{vpc_name}' found with ID: {vpc_id}")
346
+
347
+ # After finding the VPC ID, also collect the NAT Gateways attached to it:
348
+
349
+ # Look for NAT Gateways in this VPC
350
+ ec2_client = boto3.client('ec2')
351
+ nat_gateways = []
352
+ try:
353
+ response = ec2_client.describe_nat_gateways(
354
+ Filters=[
355
+ {'Name': 'vpc-id', 'Values': [vpc_id]},
356
+ # Optional: Add a tag filter if you consistently tag your NATs
357
+ # {'Name': 'tag:Name', 'Values': [f"{prefix}-nat-gateway"]}
358
+ ]
359
+ )
360
+ nat_gateways = response.get('NatGateways', [])
361
+ except Exception as e:
362
+ print(f"Warning: Could not describe NAT Gateways in VPC '{vpc_id}': {e}")
363
+ # Decide how to handle this error - proceed or raise?
364
+
365
+ # Decide how to identify the specific NAT Gateway you want to check for.
366
+
367
+
368
+
369
+ return vpc_id, nat_gateways
370
+ else:
371
+ print(f"VPC '{vpc_name}' not found.")
372
+ return None
373
+ except Exception as e:
374
+ print(f"An unexpected error occurred finding VPC '{vpc_name}': {e}")
375
+ raise
376
+
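+ # Example usage (illustrative; the VPC name is a placeholder). The function returns a
+ # (vpc_id, nat_gateways) tuple when found, or None when the VPC does not exist:
+ # result = get_vpc_id_by_name("doc-redaction-vpc")
+ # if result:
+ #     vpc_id, nat_gateways = result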
377
+ # --- Helper to fetch all existing subnets in a VPC once ---
378
+ def _get_existing_subnets_in_vpc(vpc_id: str) -> Dict[str, Any]:
379
+ """
380
+ Fetches all subnets in a given VPC.
381
+ Returns a dictionary with 'by_name' (map of name to subnet data),
382
+ 'by_id' (map of id to subnet data), and 'cidr_networks' (list of ipaddress.IPv4Network).
383
+ """
384
+ ec2_client = boto3.client('ec2')
385
+ existing_subnets_data = {
386
+ "by_name": {}, # {subnet_name: {'id': 'subnet-id', 'cidr': 'x.x.x.x/x'}}
387
+ "by_id": {}, # {subnet_id: {'name': 'subnet-name', 'cidr': 'x.x.x.x/x'}}
388
+ "cidr_networks": [] # List of ipaddress.IPv4Network objects
389
+ }
390
+ try:
391
+ response = ec2_client.describe_subnets(Filters=[{'Name': 'vpc-id', 'Values': [vpc_id]}])
392
+ for s in response.get('Subnets', []):
393
+ subnet_id = s['SubnetId']
394
+ cidr_block = s.get('CidrBlock')
395
+ # Extract 'Name' tag, which is crucial for lookup by name
396
+ name_tag = next((tag['Value'] for tag in s.get('Tags', []) if tag['Key'] == 'Name'), None)
397
+
398
+ subnet_info = {'id': subnet_id, 'cidr': cidr_block, 'name': name_tag}
399
+
400
+ if name_tag:
401
+ existing_subnets_data["by_name"][name_tag] = subnet_info
402
+ existing_subnets_data["by_id"][subnet_id] = subnet_info
403
+
404
+ if cidr_block:
405
+ try:
406
+ existing_subnets_data["cidr_networks"].append(ipaddress.ip_network(cidr_block, strict=False))
407
+ except ValueError:
408
+ print(f"Warning: Existing subnet {subnet_id} has an invalid CIDR: {cidr_block}. Skipping for overlap check.")
409
+
410
+ print(f"Fetched {len(response.get('Subnets', []))} existing subnets from VPC '{vpc_id}'.")
411
+ except Exception as e:
412
+ print(f"Error describing existing subnets in VPC '{vpc_id}': {e}. Cannot perform full validation.")
413
+ raise # Re-raise if this essential step fails
414
+
415
+ return existing_subnets_data
416
+
417
+ # --- Modified validate_subnet_creation_parameters to take pre-fetched data ---
418
+ def validate_subnet_creation_parameters(
419
+ vpc_id: str,
420
+ proposed_subnets_data: List[Dict[str, str]], # e.g., [{'name': 'my-public-subnet', 'cidr': '10.0.0.0/24', 'az': 'us-east-1a'}]
421
+ existing_aws_subnets_data: Dict[str, Any] # Pre-fetched data from _get_existing_subnets_in_vpc
422
+ ) -> None:
423
+ """
424
+ Validates proposed subnet names and CIDR blocks against existing AWS subnets
425
+ in the specified VPC and against each other.
426
+ This function uses pre-fetched AWS subnet data.
427
+
428
+ Args:
429
+ vpc_id: The ID of the VPC (for logging/error messages).
430
+ proposed_subnets_data: A list of dictionaries, where each dict represents
431
+ a proposed subnet with 'name', 'cidr', and 'az'.
432
+ existing_aws_subnets_data: Dictionary containing existing AWS subnet data
433
+ (e.g., from _get_existing_subnets_in_vpc).
434
+
435
+ Raises:
436
+ ValueError: If any proposed subnet name or CIDR block
437
+ conflicts with existing AWS resources or other proposed resources.
438
+ """
439
+ if not proposed_subnets_data:
440
+ print("No proposed subnet data provided for validation. Skipping.")
441
+ return
442
+
443
+ print(f"--- Starting pre-synth validation for VPC '{vpc_id}' with proposed subnets ---")
444
+
445
+ print("Existing subnet data:", pd.DataFrame(existing_aws_subnets_data['by_name']))
446
+
447
+ existing_aws_subnet_names = set(existing_aws_subnets_data["by_name"].keys())
448
+ existing_aws_cidr_networks = existing_aws_subnets_data["cidr_networks"]
449
+
450
+ # Sets to track names and list to track networks for internal batch consistency
451
+ proposed_names_seen: set[str] = set()
452
+ proposed_cidr_networks_seen: List[ipaddress.IPv4Network] = []
453
+
454
+ for i, proposed_subnet in enumerate(proposed_subnets_data):
455
+ subnet_name = proposed_subnet.get('name')
456
+ cidr_block_str = proposed_subnet.get('cidr')
457
+ availability_zone = proposed_subnet.get('az')
458
+
459
+ if not all([subnet_name, cidr_block_str, availability_zone]):
460
+ raise ValueError(f"Proposed subnet at index {i} is incomplete. Requires 'name', 'cidr', and 'az'.")
461
+
462
+ # 1. Check for duplicate names within the proposed batch
463
+ if subnet_name in proposed_names_seen:
464
+ raise ValueError(f"Proposed subnet name '{subnet_name}' is duplicated within the input list.")
465
+ proposed_names_seen.add(subnet_name)
466
+
467
+ # 2. Check for duplicate names against existing AWS subnets
468
+ if subnet_name in existing_aws_subnet_names:
469
+ print(f"Proposed subnet name '{subnet_name}' already exists in VPC '{vpc_id}'.")
470
+
471
+ # Parse proposed CIDR
472
+ try:
473
+ proposed_net = ipaddress.ip_network(cidr_block_str, strict=False)
474
+ except ValueError as e:
475
+ raise ValueError(f"Invalid CIDR format '{cidr_block_str}' for proposed subnet '{subnet_name}': {e}")
476
+
477
+ # 3. Check for overlapping CIDRs within the proposed batch
478
+ for existing_proposed_net in proposed_cidr_networks_seen:
479
+ if proposed_net.overlaps(existing_proposed_net):
480
+ raise ValueError(
481
+ f"Proposed CIDR '{cidr_block_str}' for subnet '{subnet_name}' "
482
+ f"overlaps with another proposed CIDR '{str(existing_proposed_net)}' "
483
+ f"within the same batch."
484
+ )
485
+
486
+ # 4. Check for overlapping CIDRs against existing AWS subnets
487
+ for existing_aws_net in existing_aws_cidr_networks:
488
+ if proposed_net.overlaps(existing_aws_net):
489
+ raise ValueError(
490
+ f"Proposed CIDR '{cidr_block_str}' for subnet '{subnet_name}' "
491
+ f"overlaps with an existing AWS subnet CIDR '{str(existing_aws_net)}' "
492
+ f"in VPC '{vpc_id}'."
493
+ )
494
+
495
+ # If all checks pass for this subnet, add its network to the list for subsequent checks
496
+ proposed_cidr_networks_seen.append(proposed_net)
497
+ print(f"Validation successful for proposed subnet '{subnet_name}' with CIDR '{cidr_block_str}'.")
498
+
499
+ print(f"--- All proposed subnets passed pre-synth validation checks for VPC '{vpc_id}'. ---")
500
+
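+ # Example usage combining the two helpers above (illustrative; the VPC ID, subnet name,
+ # CIDR and AZ are placeholders). Raises ValueError if a proposed subnet clashes with
+ # existing subnets or with another proposed subnet:
+ # existing = _get_existing_subnets_in_vpc("vpc-0123456789abcdef0")
+ # validate_subnet_creation_parameters(
+ #     "vpc-0123456789abcdef0",
+ #     [{'name': 'app-private-subnet-1', 'cidr': '10.0.10.0/24', 'az': 'eu-west-2a'}],
+ #     existing)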
501
+ # --- Modified check_subnet_exists_by_name (Uses pre-fetched data) ---
502
+ def check_subnet_exists_by_name(
503
+ subnet_name: str,
504
+ existing_aws_subnets_data: Dict[str, Any]
505
+ ) -> Tuple[bool, Optional[str]]:
506
+ """
507
+ Checks if a subnet with the given name exists within the pre-fetched data.
508
+
509
+ Args:
510
+ subnet_name: The 'Name' tag value of the subnet to check.
511
+ existing_aws_subnets_data: Dictionary containing existing AWS subnet data
512
+ (e.g., from _get_existing_subnets_in_vpc).
513
+
514
+ Returns:
515
+ A tuple:
516
+ - The first element is True if the subnet exists, False otherwise.
517
+ - The second element is the Subnet ID if found, None otherwise.
518
+ """
519
+ subnet_info = existing_aws_subnets_data["by_name"].get(subnet_name)
520
+ if subnet_info:
521
+ print(f"Subnet '{subnet_name}' found with ID: {subnet_info['id']}")
522
+ return True, subnet_info['id']
523
+ else:
524
+ print(f"Subnet '{subnet_name}' not found.")
525
+ return False, None
526
+
527
+ def create_nat_gateway(
528
+ scope: Construct,
529
+ public_subnet_for_nat: ec2.ISubnet, # Expects a proper ISubnet
530
+ nat_gateway_name: str,
531
+ nat_gateway_id_context_key: str
532
+ ) -> str:
533
+ """
534
+ Creates a single NAT Gateway in the specified public subnet.
535
+ It does not handle lookup from context; the calling stack should do that.
536
+ Returns the CloudFormation Ref of the NAT Gateway ID.
537
+ """
538
+ print(f"Defining a new NAT Gateway '{nat_gateway_name}' in subnet '{public_subnet_for_nat.subnet_id}'.")
539
+
540
+ # Create an Elastic IP for the NAT Gateway
541
+ eip = ec2.CfnEIP(scope, NAT_GATEWAY_EIP_NAME,
542
+ tags=[CfnTag(key="Name", value=NAT_GATEWAY_EIP_NAME)]
543
+ )
544
+
545
+ # Create the NAT Gateway
546
+ nat_gateway_logical_id = nat_gateway_name.replace('-', '') + "NatGateway"
547
+ nat_gateway = ec2.CfnNatGateway(scope, nat_gateway_logical_id,
548
+ subnet_id=public_subnet_for_nat.subnet_id, # Associate with the public subnet
549
+ allocation_id=eip.attr_allocation_id, # Associate with the EIP
550
+ tags=[CfnTag(key="Name", value=nat_gateway_name)]
551
+ )
552
+ # The NAT GW depends on the EIP. The dependency on the subnet is implicit via subnet_id.
553
+ nat_gateway.add_dependency(eip)
554
+
555
+ # *** CRUCIAL: Use CfnOutput to export the ID after deployment ***
556
+ # This is how you will get the ID to put into cdk.context.json
557
+ CfnOutput(scope, "SingleNatGatewayIdOutput",
558
+ value=nat_gateway.ref,
559
+ description=f"Physical ID of the Single NAT Gateway. Add this to cdk.context.json under the key '{nat_gateway_id_context_key}'.",
560
+ export_name=f"{scope.stack_name}-NatGatewayId" # Make export name unique
561
+ )
562
+
563
+ print(f"CDK: Defined new NAT Gateway '{nat_gateway.ref}'. Its physical ID will be available in the stack outputs after deployment.")
564
+ # Return the tokenised reference for use within this synthesis
565
+ return nat_gateway.ref
566
+
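+ # Example usage inside a stack (illustrative; the name and context key are placeholders).
+ # The returned value is a CloudFormation token, not the physical NAT Gateway ID:
+ # nat_gw_ref = create_nat_gateway(self, self.public_subnets[0],
+ #                                 nat_gateway_name="app-nat-gateway",
+ #                                 nat_gateway_id_context_key="single_nat_gateway_id")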
567
+ def create_subnets(
568
+ scope: Construct,
569
+ vpc: ec2.IVpc,
570
+ prefix: str,
571
+ subnet_names: List[str],
572
+ cidr_blocks: List[str],
573
+ availability_zones: List[str],
574
+ is_public: bool,
575
+ internet_gateway_id: Optional[str] = None,
576
+ single_nat_gateway_id: Optional[str] = None
577
+ ) -> Tuple[List[ec2.CfnSubnet], List[ec2.CfnRouteTable]]:
578
+ """
579
+ Creates subnets using L2 constructs but returns the underlying L1 Cfn objects
580
+ for backward compatibility.
581
+ """
582
+ # --- Validations remain the same ---
583
+ if not (len(subnet_names) == len(cidr_blocks) == len(availability_zones) > 0):
584
+ raise ValueError("Subnet names, CIDR blocks, and Availability Zones lists must be non-empty and match in length.")
585
+ if is_public and not internet_gateway_id:
586
+ raise ValueError("internet_gateway_id must be provided for public subnets.")
587
+ if not is_public and not single_nat_gateway_id:
588
+ raise ValueError("single_nat_gateway_id must be provided for private subnets when using a single NAT Gateway.")
589
+
590
+ # --- We will populate these lists with the L1 objects to return ---
591
+ created_subnets: List[ec2.CfnSubnet] = []
592
+ created_route_tables: List[ec2.CfnRouteTable] = []
593
+
594
+ subnet_type_tag = "public" if is_public else "private"
595
+
596
+ for i, subnet_name in enumerate(subnet_names):
597
+ logical_id = f"{prefix}{subnet_type_tag.capitalize()}Subnet{i+1}"
598
+
599
+ # 1. Create the L2 Subnet (this is the easy part)
600
+ subnet = ec2.Subnet(
601
+ scope,
602
+ logical_id,
603
+ vpc_id=vpc.vpc_id,
604
+ cidr_block=cidr_blocks[i],
605
+ availability_zone=availability_zones[i],
606
+ map_public_ip_on_launch=is_public
607
+ )
608
+ Tags.of(subnet).add("Name", subnet_name)
609
+ Tags.of(subnet).add("Type", subnet_type_tag)
610
+
611
+ if is_public:
612
+ # The subnet's route_table is automatically created by the L2 Subnet construct
613
+ try:
614
+ subnet.add_route(
615
+ "DefaultInternetRoute", # A logical ID for the CfnRoute resource
616
+ router_id=internet_gateway_id,
617
+ router_type=ec2.RouterType.GATEWAY,
618
+ # destination_cidr_block="0.0.0.0/0" is the default for this method
619
+ )
620
+ except Exception as e:
621
+ print("Could not create IGW route for public subnet due to:", e)
622
+ print(f"CDK: Defined public L2 subnet '{subnet_name}' and added IGW route.")
623
+ else:
624
+ try:
625
+ # Using .add_route() for private subnets as well for consistency
626
+ subnet.add_route(
627
+ "DefaultNatRoute", # A logical ID for the CfnRoute resource
628
+ router_id=single_nat_gateway_id,
629
+ router_type=ec2.RouterType.NAT_GATEWAY,
630
+ )
631
+ except Exception as e:
632
+ print("Could not create NAT gateway route for public subnet due to:", e)
633
+ print(f"CDK: Defined private L2 subnet '{subnet_name}' and added NAT GW route.")
634
+
635
+ route_table = subnet.route_table
636
+
637
+ created_subnets.append(subnet)
638
+ created_route_tables.append(route_table)
639
+
640
+ return created_subnets, created_route_tables
641
+
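+ # Example usage (illustrative; names, CIDRs, AZs and the IGW ID are placeholders):
+ # subnets, route_tables = create_subnets(
+ #     self, vpc, CDK_PREFIX, ["app-public-subnet-1", "app-public-subnet-2"],
+ #     ["10.0.1.0/24", "10.0.2.0/24"], ["eu-west-2a", "eu-west-2b"],
+ #     is_public=True, internet_gateway_id="igw-0123456789abcdef0")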
642
+ def ingress_rule_exists(security_group, peer, port=None):
643
+ for rule in security_group.connections.security_groups:
644
+ if port:
645
+ if rule.peer == peer and rule.connection == port:
646
+ return True
647
+ else:
648
+ if rule.peer == peer:
649
+ return True
650
+ return False
651
+
652
+ def check_for_existing_user_pool(user_pool_name:str):
653
+ cognito_client = boto3.client("cognito-idp")
654
+ list_pools_response = cognito_client.list_user_pools(MaxResults=60) # MaxResults up to 60
655
+
656
+ # ListUserPools might require pagination if you have more than 60 pools
657
+ # This simple example doesn't handle pagination, which could miss your pool
658
+
659
+ existing_user_pool_id = ""
660
+
661
+ for pool in list_pools_response.get('UserPools', []):
662
+ if pool.get('Name') == user_pool_name:
663
+ existing_user_pool_id = pool['Id']
664
+ print(f"Found existing user pool by name '{user_pool_name}' with ID: {existing_user_pool_id}")
665
+ break # Found the one we're looking for
666
+
667
+ if existing_user_pool_id:
668
+ return True, existing_user_pool_id, pool
669
+ else:
670
+ return False, "", ""
671
+
672
+ def check_for_existing_user_pool_client(user_pool_id: str, user_pool_client_name: str):
673
+ """
674
+ Checks if a Cognito User Pool Client with the given name exists in the specified User Pool.
675
+
676
+ Args:
677
+ user_pool_id: The ID of the Cognito User Pool.
678
+ user_pool_client_name: The name of the User Pool Client to check for.
679
+
680
+ Returns:
681
+ A tuple:
682
+ - True, client_id, client_details if the client exists.
683
+ - False, "", {} otherwise.
684
+ """
685
+ cognito_client = boto3.client("cognito-idp")
686
+ next_token = None
687
+
688
+
689
+ while True:
690
+ try:
691
+ kwargs = {"UserPoolId": user_pool_id, "MaxResults": 60}
692
+ if next_token:
693
+ kwargs["NextToken"] = next_token
694
+ response = cognito_client.list_user_pool_clients(**kwargs)
695
696
+ except cognito_client.exceptions.ResourceNotFoundException:
697
+ print(f"Error: User pool with ID '{user_pool_id}' not found.")
698
+ return False, "", {}
699
+
700
+ except cognito_client.exceptions.InvalidParameterException:
701
+ print(f"Error: No app clients for '{user_pool_id}' found.")
702
+ return False, "", {}
703
+
704
+ except Exception as e:
705
+ print("Could not check User Pool clients due to:", e)
706
+
707
+ for client in response.get('UserPoolClients', []):
708
+ if client.get('ClientName') == user_pool_client_name:
709
+ print(f"Found existing user pool client '{user_pool_client_name}' with ID: {client['ClientId']}")
710
+ return True, client['ClientId'], client
711
+
712
+ next_token = response.get('NextToken')
713
+ if not next_token:
714
+ break
715
+
716
+ return False, "", {}
717
+
718
+ def check_for_secret(secret_name: str, secret_value: dict = None):
719
+ """
720
+ Checks if a Secrets Manager secret with the given name exists.
721
+ Returns the secret value if it does, without creating anything.
722
+
723
+ Args:
724
+ secret_name: The name of the Secrets Manager secret.
725
+ secret_value: Reserved for future use; the current implementation only checks for existence.
726
+
727
+ Returns:
728
+ A tuple: (True, get_secret_value response) if the secret exists, (False, {}) otherwise.
729
+ """
730
+ secretsmanager_client = boto3.client("secretsmanager")
731
+
732
+ try:
733
+ # Try to get the secret. If it doesn't exist, a ResourceNotFoundException will be raised.
734
+ secret_value = secretsmanager_client.get_secret_value(SecretId=secret_name)
735
+ print(f"Secret '{secret_name}' already exists.")
736
+ return True, secret_value
737
+ except secretsmanager_client.exceptions.ResourceNotFoundException:
738
+ print("Secret not found")
739
+ return False, {}
740
+ except Exception as e:
741
+ # Handle other potential exceptions during the get operation
742
+ print(f"Error checking for secret '{secret_name}': {e}")
743
+ return False, {}
744
+
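+ # Example usage (illustrative; the secret name is a placeholder):
+ # secret_exists, secret_response = check_for_secret("cognito-user-pool-client-secret")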
745
+ def check_alb_exists(load_balancer_name: str, region_name: str = None) -> tuple[bool, dict]:
746
+ """
747
+ Checks if an Application Load Balancer (ALB) with the given name exists.
748
+
749
+ Args:
750
+ load_balancer_name: The name of the ALB to check.
751
+ region_name: The AWS region to check in. If None, uses the default
752
+ session region.
753
+
754
+ Returns:
755
+ A tuple:
756
+ - The first element is True if the ALB exists, False otherwise.
757
+ - The second element is the ALB object (dictionary) if found,
758
+ None otherwise. Specifically, it returns the first element of
759
+ the LoadBalancers list from the describe_load_balancers response.
760
+ """
761
+ if region_name:
762
+ elbv2_client = boto3.client('elbv2', region_name=region_name)
763
+ else:
764
+ elbv2_client = boto3.client('elbv2')
765
+ try:
766
+ response = elbv2_client.describe_load_balancers(Names=[load_balancer_name])
767
+ if response['LoadBalancers']:
768
+ return True, response['LoadBalancers'][0] # Return True and the first ALB object
769
+ else:
770
+ return False, {}
771
+ except ClientError as e:
772
+ # If the error indicates the ALB doesn't exist, return False
773
+ if e.response['Error']['Code'] == 'LoadBalancerNotFound':
774
+ return False, {}
775
+ else:
776
+ # Re-raise other exceptions
777
+ raise
778
+ except Exception as e:
779
+ print(f"An unexpected error occurred: {e}")
780
+ return False, {}
781
+
782
+ def check_fargate_task_definition_exists(task_definition_name: str, region_name: str = None) -> tuple[bool, dict]:
783
+ """
784
+ Checks if a Fargate task definition with the given name exists.
785
+
786
+ Args:
787
+ task_definition_name: The name or ARN of the task definition to check.
788
+ region_name: The AWS region to check in. If None, uses the default
789
+ session region.
790
+
791
+ Returns:
792
+ A tuple:
793
+ - The first element is True if the task definition exists, False otherwise.
794
+ - The second element is the task definition object (dictionary) if found,
795
+ None otherwise. Specifically, it returns the 'taskDefinition'
796
+ object from the describe_task_definition response.
797
+ """
798
+ if region_name:
799
+ ecs_client = boto3.client('ecs', region_name=region_name)
800
+ else:
801
+ ecs_client = boto3.client('ecs')
802
+ try:
803
+ response = ecs_client.describe_task_definition(taskDefinition=task_definition_name)
804
+ # If describe_task_definition succeeds, it returns the task definition.
805
+ # We can directly return True and the task definition.
806
+ return True, response['taskDefinition']
807
+ except ClientError as e:
808
+ # Check for the error code indicating the task definition doesn't exist.
809
+ if e.response['Error']['Code'] == 'ClientException' and 'task definition' in e.response['Error'].get('Message', '').lower():
810
+ return False, {}
811
+ else:
812
+ # Re-raise other exceptions.
813
+ raise
814
+ except Exception as e:
815
+ print(f"An unexpected error occurred: {e}")
816
+ return False, {}
817
+
818
+ def check_ecs_service_exists(cluster_name: str, service_name: str, region_name: str = None) -> tuple[bool, dict]:
819
+ """
820
+ Checks if an ECS service with the given name exists in the specified cluster.
821
+
822
+ Args:
823
+ cluster_name: The name or ARN of the ECS cluster.
824
+ service_name: The name of the ECS service to check.
825
+ region_name: The AWS region to check in. If None, uses the default
826
+ session region.
827
+
828
+ Returns:
829
+ A tuple:
830
+ - The first element is True if the service exists, False otherwise.
831
+ - The second element is the service object (dictionary) if found,
832
+ None otherwise.
833
+ """
834
+ if region_name:
835
+ ecs_client = boto3.client('ecs', region_name=region_name)
836
+ else:
837
+ ecs_client = boto3.client('ecs')
838
+ try:
839
+ response = ecs_client.describe_services(cluster=cluster_name, services=[service_name])
840
+ if response['services']:
841
+ return True, response['services'][0] # Return True and the first service object
842
+ else:
843
+ return False, {}
844
+ except ClientError as e:
845
+ # Check for the error code indicating the service doesn't exist.
846
+ if e.response['Error']['Code'] == 'ClusterNotFoundException':
847
+ return False, {}
848
+ elif e.response['Error']['Code'] == 'ServiceNotFoundException':
849
+ return False, {}
850
+ else:
851
+ # Re-raise other exceptions.
852
+ raise
853
+ except Exception as e:
854
+ print(f"An unexpected error occurred: {e}")
855
+ return False, {}
856
+
857
+ def check_cloudfront_distribution_exists(distribution_name: str, region_name: str = None) -> tuple[bool, dict | None]:
858
+ """
859
+ Checks if a CloudFront distribution with the given name exists.
860
+
861
+ Args:
862
+ distribution_name: The name of the CloudFront distribution to check.
863
+ region_name: The AWS region to check in. If None, uses the default
864
+ session region. Note: CloudFront is a global service,
865
+ so the region is usually 'us-east-1', but this parameter
866
+ is included for completeness.
867
+
868
+ Returns:
869
+ A tuple:
870
+ - The first element is True if the distribution exists, False otherwise.
871
+ - The second element is the distribution object (dictionary) if found,
872
+ None otherwise. Specifically, it returns the first element of the
873
+ DistributionList from the ListDistributions response.
874
+ """
875
+ if region_name:
876
+ cf_client = boto3.client('cloudfront', region_name=region_name)
877
+ else:
878
+ cf_client = boto3.client('cloudfront')
879
+ try:
880
+ response = cf_client.list_distributions()
881
+ if 'Items' in response['DistributionList']:
882
+ for distribution in response['DistributionList']['Items']:
883
+ # CloudFront doesn't directly filter by name, so we have to iterate.
884
+ if distribution.get('Aliases', {}).get('Items') and distribution_name in distribution['Aliases']['Items']:
885
+ return True, distribution
886
+ return False, None
887
+ else:
888
+ return False, None
889
+ except ClientError as e:
890
+ # If the error indicates the Distribution doesn't exist, return False
891
+ if e.response['Error']['Code'] == 'NoSuchDistribution':
892
+ return False, None
893
+ else:
894
+ # Re-raise other exceptions
895
+ raise
896
+ except Exception as e:
897
+ print(f"An unexpected error occurred: {e}")
898
+ return False, None
899
+
900
+ def create_web_acl_with_common_rules(scope:Construct, web_acl_name: str, waf_scope:str="CLOUDFRONT"):
901
+ '''
902
+ Use CDK to create a web ACL based on an AWS common rule set with overrides.
903
+ This function now expects a 'scope' argument, typically 'self' from your stack,
904
+ as CfnWebACL requires a construct scope.
905
+ '''
906
+
907
+ # Create full list of rules
908
+ rules = []
909
+ aws_ruleset_names = [
910
+ "AWSManagedRulesCommonRuleSet",
911
+ "AWSManagedRulesKnownBadInputsRuleSet",
912
+ "AWSManagedRulesAmazonIpReputationList"
913
+ ]
914
+
915
+ # Use a separate counter to assign unique priorities sequentially
916
+ priority_counter = 1
917
+
918
+ for aws_rule_name in aws_ruleset_names:
919
+ current_rule_action_overrides = None
920
+
921
+ # All managed rule groups need an override_action.
922
+ # 'none' means use the managed rule group's default action.
923
+ current_override_action = wafv2.CfnWebACL.OverrideActionProperty(none={})
924
+
925
+ current_priority = priority_counter
926
+ priority_counter += 1
927
+
928
+ if aws_rule_name == "AWSManagedRulesCommonRuleSet":
929
+ current_rule_action_overrides = [
930
+ wafv2.CfnWebACL.RuleActionOverrideProperty(
931
+ name="SizeRestrictions_BODY",
932
+ action_to_use=wafv2.CfnWebACL.RuleActionProperty(
933
+ allow={}
934
+ )
935
+ )
936
+ ]
937
+ # No need to set current_override_action here, it's already set above.
938
+ # If you wanted this specific rule to have a *fixed* priority, you'd handle it differently
939
+ # For now, it will get priority 1 from the counter.
940
+
941
+ rule_property = wafv2.CfnWebACL.RuleProperty(
942
+ name=aws_rule_name,
943
+ priority=current_priority,
944
+ statement=wafv2.CfnWebACL.StatementProperty(
945
+ managed_rule_group_statement=wafv2.CfnWebACL.ManagedRuleGroupStatementProperty(
946
+ vendor_name="AWS",
947
+ name=aws_rule_name,
948
+ rule_action_overrides=current_rule_action_overrides
949
+ )
950
+ ),
951
+ visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
952
+ cloud_watch_metrics_enabled=True,
953
+ metric_name=aws_rule_name,
954
+ sampled_requests_enabled=True
955
+ ),
956
+ override_action=current_override_action # THIS IS THE CRUCIAL PART FOR ALL MANAGED RULES
957
+ )
958
+
959
+ rules.append(rule_property)
960
+
961
+ # Add the rate limit rule
962
+ rate_limit_priority = priority_counter # Use the next available priority
963
+ rules.append(wafv2.CfnWebACL.RuleProperty(
964
+ name="RateLimitRule",
965
+ priority=rate_limit_priority,
966
+ statement=wafv2.CfnWebACL.StatementProperty(
967
+ rate_based_statement=wafv2.CfnWebACL.RateBasedStatementProperty(
968
+ limit=1000,
969
+ aggregate_key_type="IP"
970
+ )
971
+ ),
972
+ visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
973
+ cloud_watch_metrics_enabled=True,
974
+ metric_name="RateLimitRule",
975
+ sampled_requests_enabled=True
976
+ ),
977
+ action=wafv2.CfnWebACL.RuleActionProperty(
978
+ block={}
979
+ )
980
+ ))
981
+
982
+ web_acl = wafv2.CfnWebACL(
983
+ scope,
984
+ "WebACL",
985
+ name=web_acl_name,
986
+ default_action=wafv2.CfnWebACL.DefaultActionProperty(allow={}),
987
+ scope=waf_scope,
988
+ visibility_config=wafv2.CfnWebACL.VisibilityConfigProperty(
989
+ cloud_watch_metrics_enabled=True,
990
+ metric_name="webACL",
991
+ sampled_requests_enabled=True
992
+ ),
993
+ rules=rules
994
+ )
995
+
996
+ CfnOutput(scope, "WebACLArn", value=web_acl.attr_arn)
997
+
998
+ return web_acl
999
+
1000
+ def check_web_acl_exists(web_acl_name: str, scope: str, region_name: str = None) -> tuple[bool, dict]:
1001
+ """
1002
+ Checks if a Web ACL with the given name and scope exists.
1003
+
1004
+ Args:
1005
+ web_acl_name: The name of the Web ACL to check.
1006
+ scope: The scope of the Web ACL ('CLOUDFRONT' or 'REGIONAL').
1007
+ region_name: The AWS region to check in. Required for REGIONAL scope.
1008
+ If None, uses the default session region. For CLOUDFRONT,
1009
+ the region should be 'us-east-1'.
1010
+
1011
+ Returns:
1012
+ A tuple:
1013
+ - The first element is True if the Web ACL exists, False otherwise.
1014
+ - The second element is the Web ACL object (dictionary) if found,
1015
+ None otherwise.
1016
+ """
1017
+ if scope not in ['CLOUDFRONT', 'REGIONAL']:
1018
+ raise ValueError("Scope must be either 'CLOUDFRONT' or 'REGIONAL'")
1019
+
1020
+ if scope == 'REGIONAL' and not region_name:
1021
+ raise ValueError("Region name is required for REGIONAL scope")
1022
+
1023
+ if scope == 'CLOUDFRONT':
1024
+ region_name = 'us-east-1' # CloudFront scope requires us-east-1
1025
+
1026
+ if region_name:
1027
+ waf_client = boto3.client('wafv2', region_name=region_name)
1028
+ else:
1029
+ waf_client = boto3.client('wafv2')
1030
+ try:
1031
+ response = waf_client.list_web_acls(Scope=scope)
1032
+ if 'WebACLs' in response:
1033
+ for web_acl in response['WebACLs']:
1034
+ if web_acl['Name'] == web_acl_name:
1035
+ # Fetch the full Web ACL object (wafv2 exposes get_web_acl, which also requires the Id).
1036
+ get_response = waf_client.get_web_acl(Name=web_acl_name, Scope=scope, Id=web_acl['Id'])
1037
+ return True, get_response['WebACL']
1038
+ return False, {}
1039
+ else:
1040
+ return False, {}
1041
+ except ClientError as e:
1042
+ # Check for the error code indicating the web ACL doesn't exist.
1043
+ if e.response['Error']['Code'] == 'ResourceNotFoundException':
1044
+ return False, {}
1045
+ else:
1046
+ # Re-raise other exceptions.
1047
+ raise
1048
+ except Exception as e:
1049
+ print(f"An unexpected error occurred: {e}")
1050
+ return False, {}
1051
+
1052
+ def add_alb_https_listener_with_cert(
1053
+ scope: Construct,
1054
+ logical_id: str, # A unique ID for this listener construct
1055
+ alb: elb.ApplicationLoadBalancer,
1056
+ acm_certificate_arn: Optional[str], # Optional: If None, no HTTPS listener will be created
1057
+ default_target_group: elb.ITargetGroup, # Mandatory: The target group to forward traffic to
1058
+ listener_port_https: int = 443,
1059
+ listener_open_to_internet: bool = False, # Be cautious with True, ensure ALB security group restricts access
1060
+ # --- Cognito Authentication Parameters ---
1061
+ enable_cognito_auth: bool = False,
1062
+ cognito_user_pool: Optional[cognito.IUserPool] = None,
1063
+ cognito_user_pool_client: Optional[cognito.IUserPoolClient] = None,
1064
+ cognito_user_pool_domain: Optional[str] = None, # E.g., "my-app-domain" for "my-app-domain.auth.region.amazoncognito.com"
1065
+ cognito_auth_scope: Optional[str] = "openid profile email", # Default recommended scope
1066
+ cognito_auth_on_unauthenticated_request: elb.UnauthenticatedAction = elb.UnauthenticatedAction.AUTHENTICATE,
1067
+ stickiness_cookie_duration=None
1068
+ # --- End Cognito Parameters ---
1069
+ ) -> Optional[elb.ApplicationListener]:
1070
+ """
1071
+ Conditionally adds an HTTPS listener to an ALB with an ACM certificate,
1072
+ and optionally enables Cognito User Pool authentication.
1073
+
1074
+ Args:
1075
+ scope (Construct): The scope in which to define this construct (e.g., your CDK Stack).
1076
+ logical_id (str): A unique logical ID for the listener construct within the stack.
1077
+ alb (elb.ApplicationLoadBalancer): The Application Load Balancer to add the listener to.
1078
+ acm_certificate_arn (Optional[str]): The ARN of the ACM certificate to attach.
1079
+ If None, the HTTPS listener will NOT be created.
1080
+ default_target_group (elb.ITargetGroup): The default target group for the listener to forward traffic to.
1081
+ This is mandatory for a functional listener.
1082
+ listener_port_https (int): The HTTPS port to listen on (default: 443).
1083
+ listener_open_to_internet (bool): Whether the listener should allow connections from all sources.
1084
+ If False (recommended), ensure your ALB's security group allows
1085
+ inbound traffic on this port from desired sources.
1086
+ enable_cognito_auth (bool): Set to True to enable Cognito User Pool authentication.
1087
+ cognito_user_pool (Optional[cognito.IUserPool]): The Cognito User Pool object. Required if enable_cognito_auth is True.
1088
+ cognito_user_pool_client (Optional[cognito.IUserPoolClient]): The Cognito User Pool App Client object. Required if enable_cognito_auth is True.
1089
+ cognito_user_pool_domain (Optional[str]): The domain prefix for your Cognito User Pool. Required if enable_cognito_auth is True.
1090
+ cognito_auth_scope (Optional[str]): The scope for the Cognito authentication.
1091
+ cognito_auth_on_unauthenticated_request (elb.UnauthenticatedAction): Action for unauthenticated requests.
1092
+ Defaults to AUTHENTICATE (redirect to login).
1093
+
1094
+ Returns:
1095
+ Optional[elb.ApplicationListener]: The created ApplicationListener if successful,
1096
+ None if no ACM certificate ARN was provided.
1097
+ """
1098
+ https_listener = None
1099
+ if acm_certificate_arn:
1100
+ certificates_list = [elb.ListenerCertificate.from_arn(acm_certificate_arn)]
1101
+ print(f"Attempting to add ALB HTTPS listener on port {listener_port_https} with ACM certificate: {acm_certificate_arn}")
1102
+
1103
+ # Determine the default action based on whether Cognito auth is enabled
1104
+ default_action = None
1105
+ if enable_cognito_auth:
1106
+ if not all([cognito_user_pool, cognito_user_pool_client, cognito_user_pool_domain]):
1107
+ raise ValueError(
1108
+ "Cognito User Pool, Client, and Domain must be provided if enable_cognito_auth is True."
1109
+ )
1110
+ print(f"Enabling Cognito authentication with User Pool: {cognito_user_pool.user_pool_id}")
1111
+
1112
+ default_action = elb_act.AuthenticateCognitoAction(
1113
+ next=elb.ListenerAction.forward([default_target_group]), # After successful auth, forward to TG
1114
+ user_pool=cognito_user_pool,
1115
+ user_pool_client=cognito_user_pool_client,
1116
+ user_pool_domain=cognito_user_pool_domain,
1117
+ scope=cognito_auth_scope,
1118
+ on_unauthenticated_request=cognito_auth_on_unauthenticated_request,
1119
+ session_timeout=stickiness_cookie_duration
1120
+ # Additional options you might want to configure:
1121
+ # session_cookie_name="AWSELBCookies"
1122
+ )
1123
+ else:
1124
+ default_action = elb.ListenerAction.forward([default_target_group])
1125
+ print("Cognito authentication is NOT enabled for this listener.")
1126
+
1127
+ # Add the HTTPS listener
1128
+ https_listener = alb.add_listener(
1129
+ logical_id,
1130
+ port=listener_port_https,
1131
+ open=listener_open_to_internet,
1132
+ certificates=certificates_list,
1133
+ default_action=default_action # Use the determined default action
1134
+ )
1135
+ print(f"ALB HTTPS listener on port {listener_port_https} defined.")
1136
+ else:
1137
+ print("ACM_CERTIFICATE_ARN is not provided. Skipping HTTPS listener creation.")
1138
+
1139
+ return https_listener
1140
+
1141
+
1142
+ def ensure_folder_exists(output_folder:str):
1143
+ """Checks if the specified folder exists, creates it if not."""
1144
+
1145
+ if not os.path.exists(output_folder):
1146
+ # Create the folder if it doesn't exist
1147
+ os.makedirs(output_folder, exist_ok=True)
1148
+ print(f"Created the {output_folder} folder.")
1149
+ else:
1150
+ print(f"The {output_folder} folder already exists.")
1151
+
1152
+ def create_basic_config_env(out_dir:str="config", S3_LOG_CONFIG_BUCKET_NAME=S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME=S3_OUTPUT_BUCKET_NAME, ACCESS_LOG_DYNAMODB_TABLE_NAME=ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME=USAGE_LOG_DYNAMODB_TABLE_NAME):
1153
+ '''
1154
+ Create a basic config.env file for the user to use with their newly deployed redaction app.
1155
+ '''
1156
+ variables = {
1157
+ 'COGNITO_AUTH':'1',
1158
+ 'RUN_AWS_FUNCTIONS':'1',
1159
+ 'DISPLAY_FILE_NAMES_IN_LOGS':'False',
1160
+ 'SESSION_OUTPUT_FOLDER':'True',
1161
+ 'SAVE_LOGS_TO_DYNAMODB':'True',
1162
+ 'SHOW_COSTS':'True',
1163
+ 'SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS':'True',
1164
+ 'LOAD_PREVIOUS_TEXTRACT_JOBS_S3':'True',
1165
+ 'DOCUMENT_REDACTION_BUCKET':S3_LOG_CONFIG_BUCKET_NAME,
1166
+ 'TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET':S3_OUTPUT_BUCKET_NAME,
1167
+ 'ACCESS_LOG_DYNAMODB_TABLE_NAME':ACCESS_LOG_DYNAMODB_TABLE_NAME,
1168
+ 'FEEDBACK_LOG_DYNAMODB_TABLE_NAME':FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
1169
+ 'USAGE_LOG_DYNAMODB_TABLE_NAME':USAGE_LOG_DYNAMODB_TABLE_NAME
1170
1171
+ }
1172
+
1173
+ # Write variables to .env file
1174
+ ensure_folder_exists(out_dir + "/")
1175
+ env_file_path = os.path.abspath(os.path.join(out_dir, 'config.env'))
1176
+
1177
+ # It's good practice to ensure the file exists before calling set_key repeatedly.
1178
+ # set_key will create it, but for a loop, it might be cleaner to ensure it's empty/exists once.
1179
+ if not os.path.exists(env_file_path):
1180
+ with open(env_file_path, 'w') as f:
1181
+ pass # Create empty file
1182
+
1183
+ for key, value in variables.items():
1184
+ set_key(env_file_path, key, str(value), quote_mode="never")
1185
+
1186
+ return variables
1187
+
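+ # Example usage (illustrative; bucket and table names default to the values imported
+ # from cdk_config unless overridden):
+ # create_basic_config_env(out_dir="config")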
1188
+ def start_codebuild_build(PROJECT_NAME:str, AWS_REGION:str = AWS_REGION):
1189
+ '''
1190
+ Start an existing Codebuild project build
1191
+ '''
1192
+
1193
+ # --- Initialize CodeBuild client ---
1194
+ client = boto3.client('codebuild', region_name=AWS_REGION)
1195
+
1196
+ try:
1197
+ print(f"Attempting to start build for project: {PROJECT_NAME}")
1198
+
1199
+ response = client.start_build(
1200
+ projectName=PROJECT_NAME
1201
+ )
1202
+
1203
+ build_id = response['build']['id']
1204
+ print(f"Successfully started build with ID: {build_id}")
1205
+ print(f"Build ARN: {response['build']['arn']}")
1206
+ print(f"Build URL (approximate - construct based on region and ID):")
1207
+ print(f"https://{AWS_REGION}.console.aws.amazon.com/codesuite/codebuild/projects/{PROJECT_NAME}/build/{build_id.split(':')[-1]}/detail")
1208
+
1209
+ # You can inspect the full response if needed
1210
+ # print("\nFull response:")
1211
+ # import json
1212
+ # print(json.dumps(response, indent=2))
1213
+
1214
+ except client.exceptions.ResourceNotFoundException:
1215
+ print(f"Error: Project '{PROJECT_NAME}' not found in region '{AWS_REGION}'.")
1216
+ except Exception as e:
1217
+ print(f"An unexpected error occurred: {e}")
1218
+
1219
+ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str, RUN_AWS_FUNCTIONS:str = "1"):
1220
+ """
1221
+ Uploads a file from local machine to Amazon S3.
1222
+
1223
+ Args:
1224
+ - local_file_paths: Local file path(s) of the file(s) to upload.
1225
+ - s3_key: Key (path) to the file in the S3 bucket.
1226
+ - s3_bucket: Name of the S3 bucket.
1227
+
1228
+ Returns:
1229
+ - Message as variable/printed to console
1230
+ """
1231
+ final_out_message = []
1232
+ final_out_message_str = ""
1233
+
1234
+ if RUN_AWS_FUNCTIONS == "1":
1235
+ try:
1236
+ if s3_bucket and local_file_paths:
1237
+
1238
+ s3_client = boto3.client('s3', region_name=AWS_REGION)
1239
+
1240
+ if isinstance(local_file_paths, str):
1241
+ local_file_paths = [local_file_paths]
1242
+
1243
+ for file in local_file_paths:
1244
+ if s3_client:
1245
+ #print(s3_client)
1246
+ try:
1247
+ # Get file name off file path
1248
+ file_name = os.path.basename(file)
1249
+
1250
+ s3_key_full = s3_key + file_name
1251
+ print("S3 key: ", s3_key_full)
1252
+
1253
+ s3_client.upload_file(file, s3_bucket, s3_key_full)
1254
+ out_message = "File " + file_name + " uploaded successfully!"
1255
+ print(out_message)
1256
+
1257
+ except Exception as e:
1258
+ out_message = f"Error uploading file(s): {e}"
1259
+ print(out_message)
1260
+
1261
+ final_out_message.append(out_message)
1262
+ final_out_message_str = '\n'.join(final_out_message)
1263
+
1264
+ else: final_out_message_str = "Could not connect to AWS."
1265
+ else: final_out_message_str = "At least one essential variable is empty, could not upload to S3"
1266
+ except Exception as e:
1267
+ final_out_message_str = "Could not upload files to S3 due to: " + str(e)
1268
+ print(final_out_message_str)
1269
+ else:
1270
+ final_out_message_str = "App not set to run AWS functions"
1271
+
1272
+ return final_out_message_str
1273
+
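+ # Example usage (illustrative; the file path, key prefix and bucket are placeholders):
+ # message = upload_file_to_s3(["config/config.env"], "config/", "example-config-bucket")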
1274
+ # Initialize ECS client
1275
+ def start_ecs_task(cluster_name, service_name):
1276
+ ecs_client = boto3.client('ecs')
1277
+
1278
+ try:
1279
+ # Update the service to set the desired count to 1
1280
+ response = ecs_client.update_service(
1281
+ cluster=cluster_name,
1282
+ service=service_name,
1283
+ desiredCount=1
1284
+ )
1285
+ return {
1286
+ "statusCode": 200,
1287
+ "body": f"Service {service_name} in cluster {cluster_name} has been updated to 1 task."
1288
+ }
1289
+ except Exception as e:
1290
+ return {
1291
+ "statusCode": 500,
1292
+ "body": f"Error updating service: {str(e)}"
1293
+ }
cdk/cdk_stack.py ADDED
@@ -0,0 +1,1317 @@
1
+ import os
2
+ import json # You might still need json if loading task_definition.json
3
+ from typing import List, Dict, Any
4
+ from aws_cdk import (
5
+ Stack,
6
+ CfnTag, # <-- Import CfnTag directly
7
+ CfnOutput, # <-- Import CfnOutput directly
8
+ Duration,
9
+ RemovalPolicy,
10
+ SecretValue,
11
+ aws_ec2 as ec2,
12
+ aws_ecr as ecr,
13
+ aws_s3 as s3,
14
+ aws_ecs as ecs,
15
+ aws_iam as iam,
16
+ aws_codebuild as codebuild,
17
+ aws_cognito as cognito,
18
+ aws_secretsmanager as secretsmanager,
19
+ aws_cloudfront as cloudfront,
20
+ aws_cloudfront_origins as origins,
21
+ aws_elasticloadbalancingv2 as elbv2,
22
+ aws_logs as logs,
23
+ aws_wafv2 as wafv2,
24
+ aws_dynamodb as dynamodb # Import the DynamoDB module
25
+ )
26
+
27
+ from constructs import Construct
28
+ from cdk_config import CDK_PREFIX, VPC_NAME, AWS_MANAGED_TASK_ROLES_LIST, GITHUB_REPO_USERNAME, GITHUB_REPO_NAME, GITHUB_REPO_BRANCH, ECS_TASK_MEMORY_SIZE, ECS_TASK_CPU_SIZE, CUSTOM_HEADER, CUSTOM_HEADER_VALUE, AWS_REGION, CLOUDFRONT_GEO_RESTRICTION, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS, GRADIO_SERVER_PORT, PUBLIC_SUBNETS_TO_USE, PUBLIC_SUBNET_CIDR_BLOCKS, PUBLIC_SUBNET_AVAILABILITY_ZONES, PRIVATE_SUBNETS_TO_USE, PRIVATE_SUBNET_CIDR_BLOCKS, PRIVATE_SUBNET_AVAILABILITY_ZONES, CODEBUILD_PROJECT_NAME, ECS_SECURITY_GROUP_NAME, ALB_NAME_SECURITY_GROUP_NAME, ALB_NAME, COGNITO_USER_POOL_NAME, COGNITO_USER_POOL_CLIENT_NAME, COGNITO_USER_POOL_CLIENT_SECRET_NAME, FARGATE_TASK_DEFINITION_NAME, ECS_SERVICE_NAME, WEB_ACL_NAME, CLOUDFRONT_DISTRIBUTION_NAME, ECS_TASK_ROLE_NAME, ALB_TARGET_GROUP_NAME, S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME, ACM_CERTIFICATE_ARN, CLUSTER_NAME, CODEBUILD_ROLE_NAME, ECS_TASK_EXECUTION_ROLE_NAME, ECR_CDK_REPO_NAME, ECS_LOG_GROUP_NAME, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, TASK_DEFINITION_FILE_LOCATION, EXISTING_IGW_ID, SINGLE_NAT_GATEWAY_ID, NAT_GATEWAY_NAME, COGNITO_USER_POOL_DOMAIN_PREFIX, COGNITO_REDIRECTION_URL, AWS_ACCOUNT_ID, ECS_USE_FARGATE_SPOT, ECS_READ_ONLY_FILE_SYSTEM, USE_CLOUDFRONT, LOAD_BALANCER_WEB_ACL_NAME
29
+ from cdk_functions import create_subnets, create_web_acl_with_common_rules, add_custom_policies, add_alb_https_listener_with_cert, create_nat_gateway # Only keep CDK-native functions
30
+
31
+ def _get_env_list(env_var_name: str) -> List[str]:
32
+ """Parses a comma-separated environment variable into a list of strings."""
33
+ value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
34
+ if not value:
35
+ return []
36
+ # Split by comma and filter out any empty strings that might result from extra commas
37
+ return [s.strip() for s in value.split(',') if s.strip()]
38
+
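+ # Example (illustrative): a config value such as "['subnet-a', 'subnet-b']"
+ # is parsed to the list ['subnet-a', 'subnet-b'].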
39
+ # 1. Try to load CIDR/AZs from environment variables
40
+ if PUBLIC_SUBNETS_TO_USE: PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
41
+ if PRIVATE_SUBNETS_TO_USE: PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
42
+
43
+ if PUBLIC_SUBNET_CIDR_BLOCKS: PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
44
+ if PUBLIC_SUBNET_AVAILABILITY_ZONES: PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
45
+ if PRIVATE_SUBNET_CIDR_BLOCKS: PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
46
+ if PRIVATE_SUBNET_AVAILABILITY_ZONES: PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
47
+
48
+ if AWS_MANAGED_TASK_ROLES_LIST: AWS_MANAGED_TASK_ROLES_LIST = _get_env_list(AWS_MANAGED_TASK_ROLES_LIST)
49
+ class CdkStack(Stack):
50
+
51
+ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
52
+ super().__init__(scope, construct_id, **kwargs)
53
+
54
+ # --- Helper to get context values ---
55
+ def get_context_bool(key: str, default: bool = False) -> bool:
56
+ return self.node.try_get_context(key) or default
57
+
58
+ def get_context_str(key: str, default: str = None) -> str:
59
+ return self.node.try_get_context(key) or default
60
+
61
+ def get_context_dict(key: str, default: dict = None) -> dict:
62
+ return self.node.try_get_context(key) or default
63
+
64
+ def get_context_list_of_dicts(key: str) -> List[Dict[str, Any]]:
65
+ ctx_value = self.node.try_get_context(key)
66
+ if not isinstance(ctx_value, list):
67
+ print(f"Warning: Context key '{key}' not found or not a list. Returning empty list.")
68
+ return []
69
+ # Optional: Add validation that all items in the list are dicts
70
+ return ctx_value
71
+
72
+
73
+ # --- VPC and Subnets (Assuming VPC is always lookup, Subnets are created/returned by create_subnets) ---
74
+ # --- VPC Lookup (Always lookup as per your assumption) ---
75
+ try:
76
+ vpc = ec2.Vpc.from_lookup(
77
+ self,
78
+ "VPC",
79
+ vpc_name=VPC_NAME
80
+ )
81
+ print("Successfully looked up VPC:", vpc.vpc_id)
82
+ except Exception as e:
83
+ raise Exception(f"Could not look up VPC with name '{VPC_NAME}' due to: {e}")
84
+
85
+ # --- Subnet Handling (Check Context and Create/Import) ---
86
+ # Initialize lists to hold ISubnet objects (L2) and CfnSubnet/CfnRouteTable (L1)
87
+ # We will store ISubnet for consistency, as CfnSubnet has a .subnet_id property
88
+ self.public_subnets: List[ec2.ISubnet] = []
89
+ self.private_subnets: List[ec2.ISubnet] = []
90
+ # Store L1 CfnRouteTables explicitly if you need to reference them later
91
+ self.private_route_tables_cfn: List[ec2.CfnRouteTable] = []
92
+ self.public_route_tables_cfn: List[ec2.CfnRouteTable] = [] # New: to store public RTs
93
+
94
+ names_to_create_private = []
95
+ names_to_create_public = []
96
+
97
+ if not PUBLIC_SUBNETS_TO_USE and not PRIVATE_SUBNETS_TO_USE:
98
+ print("Warning: No public or private subnets specified in *_SUBNETS_TO_USE. Attempting to select from existing VPC subnets.")
99
+
100
+ print("vpc.public_subnets:", vpc.public_subnets)
101
+ print("vpc.private_subnets:", vpc.private_subnets)
102
+
103
+ # public_subnets_by_az: Dict[str, List[ec2.ISubnet]] = {}
104
+ # private_subnets_by_az: Dict[str, List[ec2.ISubnet]] = {}
105
+
106
+ # Iterate through the subnets exposed by the Vpc L2 construct.
107
+ # for subnet in vpc.public_subnets:
108
+ # az = subnet.availability_zone
109
+ # if az not in public_subnets_by_az:
110
+ # public_subnets_by_az[az] = []
111
+ # public_subnets_by_az[az].append(subnet)
112
+
113
+ selected_public_subnets = vpc.select_subnets(subnet_type=ec2.SubnetType.PUBLIC, one_per_az=True)
114
+ private_subnets_egress = vpc.select_subnets(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS, one_per_az=True)
115
+ private_subnets_isolated = vpc.select_subnets(subnet_type=ec2.SubnetType.PRIVATE_ISOLATED, one_per_az=True)
116
+
117
+ combined_subnet_objects = []
118
+
119
+ if private_subnets_egress.subnets:
120
+ # Add the first PRIVATE_WITH_EGRESS subnet
121
+ combined_subnet_objects.append(private_subnets_egress.subnets[0])
122
+ else:
123
+ self.node.add_warning("No PRIVATE_WITH_EGRESS subnets found to select the first one.")
124
+
125
+ # Add all PRIVATE_ISOLATED subnets *except* the first one (if they exist)
126
+ if len(private_subnets_isolated.subnets) > 1:
127
+ combined_subnet_objects.extend(private_subnets_isolated.subnets[1:])
128
+ elif private_subnets_isolated.subnets: # Only 1 isolated subnet, add a warning if [1:] was desired
129
+ self.node.add_warning("Only one PRIVATE_ISOLATED subnet found, private_subnets_isolated.subnets[1:] will be empty.")
130
+ else:
131
+ self.node.add_warning("No PRIVATE_ISOLATED subnets found.")
132
+
133
+ # Create an ec2.SelectedSubnets object from the combined private subnet list.
134
+ selected_private_subnets = vpc.select_subnets(
135
+ subnets=combined_subnet_objects
136
+ )
137
+
138
+ print("selected_public_subnets:", selected_public_subnets)
139
+ print("selected_private_subnets:", selected_private_subnets)
140
+
141
+
142
+ #self.private_route_tables_cfn = []
143
+
144
+ # for subnet in vpc.private_subnets:
145
+ # az = subnet.availability_zone
146
+ # if az not in private_subnets_by_az:
147
+ # private_subnets_by_az[az] = []
148
+ # private_subnets_by_az[az].append(subnet)
149
+
150
+ #selected_public_subnets: List[ec2.ISubnet] = []
151
+ #selected_private_subnets: List[ec2.ISubnet] = []
152
+
153
+ # Select one public subnet per AZ, preferring the first one found
154
+ # for az in sorted(public_subnets_by_az.keys()):
155
+ # if public_subnets_by_az[az]:
156
+ # selected_public_subnets.append(public_subnets_by_az[az][0])
157
+ # print(f"Selected existing public subnet: {public_subnets_by_az[az][0].subnet_id} from AZ {az}.")
158
+
159
+ # Select one private subnet per AZ, preferring the first one found
160
+ # for az in sorted(private_subnets_by_az.keys()):
161
+ # if private_subnets_by_az[az]:
162
+ # selected_private_subnets.append(private_subnets_by_az[az][0])
163
+ # print(f"Selected existing private subnet: {private_subnets_by_az[az][0].subnet_id} from AZ {az}.")
164
+
165
+ if len(selected_public_subnets.subnet_ids) < 2 or len(selected_private_subnets.subnet_ids) < 2:
166
+ raise Exception("Need at least two public or private subnets in different availability zones")
167
+
168
+ if not selected_public_subnets and not selected_private_subnets:
169
+ # If no subnets could be found even with automatic selection, raise an error.
170
+ # This ensures the stack doesn't proceed if it absolutely needs subnets.
171
+ print("Error: No existing public or private subnets could be found in the VPC for automatic selection. "
172
+ "You must either specify subnets in *_SUBNETS_TO_USE or ensure the VPC has discoverable subnets.")
173
+ raise RuntimeError("No suitable subnets found for automatic selection.")
174
+ else:
175
+ self.public_subnets = selected_public_subnets.subnets
176
+ self.private_subnets = selected_private_subnets.subnets
177
+ print(f"Automatically selected {len(self.public_subnets)} public and {len(self.private_subnets)} private subnets based on VPC discovery.")
178
+
179
+ print("self.public_subnets:", self.public_subnets)
180
+ print("self.private_subnets:", self.private_subnets)
181
+ # Since subnets are now assigned, we can exit this processing block.
182
+ # The rest of the original code (which iterates *_SUBNETS_TO_USE) will be skipped.
183
+
184
+ checked_public_subnets_ctx = get_context_dict("checked_public_subnets")
185
+ checked_private_subnets_ctx = get_context_dict("checked_private_subnets")
186
+
187
+ public_subnets_data_for_creation_ctx = get_context_list_of_dicts("public_subnets_to_create")
188
+ private_subnets_data_for_creation_ctx = get_context_list_of_dicts("private_subnets_to_create")
189
+
190
+ # --- 3. Process Public Subnets ---
191
+ print("\n--- Processing Public Subnets ---")
192
+ # Import existing public subnets
193
+ if checked_public_subnets_ctx:
194
+ for i, subnet_name in enumerate(PUBLIC_SUBNETS_TO_USE):
195
+ subnet_info = checked_public_subnets_ctx.get(subnet_name)
196
+ if subnet_info and subnet_info.get("exists"):
197
+ subnet_id = subnet_info.get("id")
198
+ if not subnet_id:
199
+ raise RuntimeError(f"Context for existing public subnet '{subnet_name}' is missing 'id'.")
200
+ try:
201
+ imported_subnet = ec2.Subnet.from_subnet_id(
202
+ self, f"ImportedPublicSubnet{subnet_name.replace('-', '')}{i}", subnet_id
203
+ )
204
+ #self.public_subnets.append(imported_subnet)
205
+ print(f"Imported existing public subnet: {subnet_name} (ID: {subnet_id})")
206
+ except Exception as e:
207
+ raise RuntimeError(f"Failed to import public subnet '{subnet_name}' with ID '{subnet_id}'. Error: {e}")
208
+
209
+ # Create new public subnets based on public_subnets_data_for_creation_ctx
210
+ if public_subnets_data_for_creation_ctx:
211
+ names_to_create_public = [s['name'] for s in public_subnets_data_for_creation_ctx]
212
+ cidrs_to_create_public = [s['cidr'] for s in public_subnets_data_for_creation_ctx]
213
+ azs_to_create_public = [s['az'] for s in public_subnets_data_for_creation_ctx]
214
+
215
+ if names_to_create_public:
216
+ print(f"Attempting to create {len(names_to_create_public)} new public subnets: {names_to_create_public}")
217
+ newly_created_public_subnets, newly_created_public_rts_cfn = create_subnets(
218
+ self, vpc, CDK_PREFIX, names_to_create_public, cidrs_to_create_public, azs_to_create_public,
219
+ is_public=True,
220
+ internet_gateway_id=EXISTING_IGW_ID
221
+ )
222
+ self.public_subnets.extend(newly_created_public_subnets)
223
+ self.public_route_tables_cfn.extend(newly_created_public_rts_cfn)
224
+
225
+ if not self.public_subnets:
226
+ raise Exception("No public subnets found or created, exiting.")
227
+
228
+
229
+ # --- NAT Gateway Creation/Lookup ---
230
+ self.single_nat_gateway_id = None
231
+
232
+ nat_gw_id_from_context = SINGLE_NAT_GATEWAY_ID
233
+
234
+ if nat_gw_id_from_context:
235
+ print(f"Using existing NAT Gateway ID from context: {nat_gw_id_from_context}")
236
+ self.single_nat_gateway_id = nat_gw_id_from_context
237
+ else:
238
+ # If not in context, create a new one, but only if we have a public subnet.
239
+ if self.public_subnets:
240
+ print("NAT Gateway ID not found in context. Creating a new one.")
241
+ # Place the NAT GW in the first available public subnet
242
+ first_public_subnet = self.public_subnets[0]
243
+
244
+ self.single_nat_gateway_id = create_nat_gateway(
245
+ self,
246
+ first_public_subnet,
247
+ nat_gateway_name=NAT_GATEWAY_NAME,
248
+ nat_gateway_id_context_key=SINGLE_NAT_GATEWAY_ID
249
+ )
250
+ else:
251
+ print("WARNING: No public subnets available. Cannot create a NAT Gateway.")
252
+
253
+
254
+ # --- 4. Process Private Subnets ---
255
+ print("\n--- Processing Private Subnets ---")
256
+ # ... (rest of your existing subnet processing logic for checked_private_subnets_ctx) ...
257
+ # (This part for importing existing subnets remains the same)
258
+
259
+ # Create new private subnets
260
+ if private_subnets_data_for_creation_ctx:
261
+ names_to_create_private = [s['name'] for s in private_subnets_data_for_creation_ctx]
262
+ cidrs_to_create_private = [s['cidr'] for s in private_subnets_data_for_creation_ctx]
263
+ azs_to_create_private = [s['az'] for s in private_subnets_data_for_creation_ctx]
264
+
265
+ if names_to_create_private:
266
+ print(f"Attempting to create {len(names_to_create_private)} new private subnets: {names_to_create_private}")
267
+ # --- CALL THE NEW CREATE_SUBNETS FUNCTION FOR PRIVATE ---
268
+ # Ensure self.single_nat_gateway_id is available before this call
269
+ if not self.single_nat_gateway_id:
270
+ raise ValueError("A single NAT Gateway ID is required for private subnets but was not resolved.")
271
+
272
+ newly_created_private_subnets_cfn, newly_created_private_rts_cfn = create_subnets(
273
+ self, vpc, CDK_PREFIX, names_to_create_private, cidrs_to_create_private, azs_to_create_private,
274
+ is_public=False,
275
+ single_nat_gateway_id=self.single_nat_gateway_id # Pass the single NAT Gateway ID
276
+ )
277
+ self.private_subnets.extend(newly_created_private_subnets_cfn)
278
+ self.private_route_tables_cfn.extend(newly_created_private_rts_cfn)
279
+ print(f"Successfully defined {len(newly_created_private_subnets_cfn)} new private subnets and their route tables for creation.")
280
+ else:
281
+ print("No private subnets specified for creation in context ('private_subnets_to_create').")
282
+
283
+ if not self.private_subnets:
284
+ raise Exception("No private subnets found or created, exiting.")
285
+
286
+ # --- 5. Sanity Check and Output ---
287
+
288
+ # Output the single NAT Gateway ID for verification
289
+ if self.single_nat_gateway_id:
290
+ CfnOutput(self, "SingleNatGatewayId", value=self.single_nat_gateway_id,
291
+ description="ID of the single NAT Gateway used for private subnets.")
292
+ else:
293
+ raise Exception("No single NAT Gateway was created or resolved.")
294
+
295
+ # --- Outputs for other stacks/regions ---
296
+ # These are crucial for cross-stack, cross-region referencing
297
+
298
+ self.params = dict()
299
+ self.params["vpc_id"] = vpc.vpc_id
300
+ self.params["private_subnets"] = self.private_subnets
301
+ self.params["private_route_tables"] = self.private_route_tables_cfn
302
+ self.params["public_subnets"] = self.public_subnets
303
+ self.params["public_route_tables"] = self.public_route_tables_cfn
304
+
305
+
306
+ #class CdkStackMain(Stack):
307
+ # def __init__(self, scope: Construct, construct_id: str, private_subnets:List[ec2.ISubnet]=[], private_route_tables: List[ec2.CfnRouteTable]=[], public_subnets:List[ec2.ISubnet]=[], public_route_tables: List[ec2.CfnRouteTable]=[], **kwargs) -> None:
308
+ # super().__init__(scope, construct_id, **kwargs)
309
+
310
+ # --- Helper to get context values ---
311
+ # def get_context_bool(key: str, default: bool = False) -> bool:
312
+ # return self.node.try_get_context(key) or default
313
+
314
+ # def get_context_str(key: str, default: str = None) -> str:
315
+ # return self.node.try_get_context(key) or default
316
+
317
+ # def get_context_dict(key: str, default: dict = None) -> dict:
318
+ # return self.node.try_get_context(key) or default
319
+
320
+ # def get_context_list_of_dicts(key: str) -> List[Dict[str, Any]]:
321
+ # ctx_value = self.node.try_get_context(key)
322
+
323
+ # if not isinstance(ctx_value, list):
324
+ # print(f"Warning: Context key '{key}' not found or not a list. Returning empty list.")
325
+ # return []
326
+ # # Optional: Add validation that all items in the list are dicts
327
+ # return ctx_value
328
+
329
+ # self.private_subnets: List[ec2.ISubnet] = private_subnets
330
+ # self.private_route_tables_cfn: List[ec2.CfnRouteTable] = private_route_tables
331
+ # self.public_subnets: List[ec2.ISubnet] = public_subnets
332
+ # self.public_route_tables_cfn: List[ec2.CfnRouteTable] = public_route_tables
333
+
334
+ private_subnet_selection = ec2.SubnetSelection(subnets=self.private_subnets)
335
+ public_subnet_selection = ec2.SubnetSelection(subnets=self.public_subnets)
336
+
337
+ for sub in private_subnet_selection.subnets:
338
+ print("private subnet:", sub.subnet_id, "is in availability zone:", sub.availability_zone)
339
+
340
+ for sub in public_subnet_selection.subnets:
341
+ print("public subnet:", sub.subnet_id, "is in availability zone:", sub.availability_zone)
342
+
343
+ # try:
344
+ # vpc = ec2.Vpc.from_lookup(
345
+ # self,
346
+ # "VPC",
347
+ # vpc_name=VPC_NAME
348
+ # )
349
+ # print("Successfully looked up VPC")
350
+ # except Exception as e:
351
+ # raise Exception(f"Could not look up VPC with name '{VPC_NAME}' due to: {e}")
352
+
353
+ print("Private subnet route tables:", self.private_route_tables_cfn)
354
+
355
+ # Add the S3 Gateway Endpoint to the VPC
356
+ if private_subnets_data_for_creation_ctx: # guard on the context list itself, so this does not NameError when no new private subnets are requested
357
+ try:
358
+ s3_gateway_endpoint = vpc.add_gateway_endpoint(
359
+ "S3GatewayEndpoint",
360
+ service=ec2.GatewayVpcEndpointAwsService.S3, subnets=[private_subnet_selection])
361
+ except Exception as e:
362
+ print("Could not add S3 gateway endpoint to subnets due to:", e)
363
+
364
+ #Output some useful information
365
+ CfnOutput(self, "VpcIdOutput", value=vpc.vpc_id,
366
+ description="The ID of the VPC where the S3 Gateway Endpoint is deployed.")
367
+ CfnOutput(self, "S3GatewayEndpointService", value=s3_gateway_endpoint.vpc_endpoint_id,
368
+ description="The id for the S3 Gateway Endpoint.") # Specify the S3 service
369
+
370
+ # --- IAM Roles ---
371
+ try:
372
+ codebuild_role_name = CODEBUILD_ROLE_NAME
373
+ custom_sts_kms_policy = """{
374
+ "Version": "2012-10-17",
375
+ "Statement": [
376
+ {
377
+ "Sid": "STSCallerIdentity",
378
+ "Effect": "Allow",
379
+ "Action": [
380
+ "sts:GetCallerIdentity"
381
+ ],
382
+ "Resource": "*"
383
+ },
384
+ {
385
+ "Sid": "KMSAccess",
386
+ "Effect": "Allow",
387
+ "Action": [
388
+ "kms:Encrypt",
389
+ "kms:Decrypt",
390
+ "kms:GenerateDataKey"
391
+ ],
392
+ "Resource": "*"
393
+ }
394
+ ]
395
+ }"""
396
+
397
+ if get_context_bool(f"exists:{codebuild_role_name}"):
398
+ # If exists, lookup/import the role using ARN from context
399
+ role_arn = get_context_str(f"arn:{codebuild_role_name}")
400
+ if not role_arn:
401
+ raise ValueError(f"Context value 'arn:{codebuild_role_name}' is required if role exists.")
402
+ codebuild_role = iam.Role.from_role_arn(self, "CodeBuildRole", role_arn=role_arn)
403
+ print("Using existing CodeBuild role")
404
+ else:
405
+ # If not exists, create the role
406
+ codebuild_role = iam.Role(
407
+ self, "CodeBuildRole", # Logical ID
408
+ role_name=codebuild_role_name, # Explicit resource name
409
+ assumed_by=iam.ServicePrincipal("codebuild.amazonaws.com")
410
+ )
411
+ codebuild_role.add_managed_policy(iam.ManagedPolicy.from_aws_managed_policy_name(f"EC2InstanceProfileForImageBuilderECRContainerBuilds"))
412
+ print("Successfully created new CodeBuild role")
413
+
414
+ task_role_name = ECS_TASK_ROLE_NAME
415
+ if get_context_bool(f"exists:{task_role_name}"):
416
+ role_arn = get_context_str(f"arn:{task_role_name}")
417
+ if not role_arn:
418
+ raise ValueError(f"Context value 'arn:{task_role_name}' is required if role exists.")
419
+ task_role = iam.Role.from_role_arn(self, "TaskRole", role_arn=role_arn)
420
+ print("Using existing ECS task role")
421
+ else:
422
+ task_role = iam.Role(
423
+ self, "TaskRole", # Logical ID
424
+ role_name=task_role_name, # Explicit resource name
425
+ assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com")
426
+ )
427
+ for role in AWS_MANAGED_TASK_ROLES_LIST:
428
+ print(f"Adding {role} to policy")
429
+ task_role.add_managed_policy(iam.ManagedPolicy.from_aws_managed_policy_name(f"{role}"))
430
+ task_role = add_custom_policies(self, task_role, custom_policy_text=custom_sts_kms_policy)
431
+ print("Successfully created new ECS task role")
432
+
433
+ execution_role_name = ECS_TASK_EXECUTION_ROLE_NAME
434
+ if get_context_bool(f"exists:{execution_role_name}"):
435
+ role_arn = get_context_str(f"arn:{execution_role_name}")
436
+ if not role_arn:
437
+ raise ValueError(f"Context value 'arn:{execution_role_name}' is required if role exists.")
438
+ execution_role = iam.Role.from_role_arn(self, "ExecutionRole", role_arn=role_arn)
439
+ print("Using existing ECS execution role")
440
+ else:
441
+ execution_role = iam.Role(
442
+ self, "ExecutionRole", # Logical ID
443
+ role_name=execution_role_name, # Explicit resource name
444
+ assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com")
445
+ )
446
+ for role in AWS_MANAGED_TASK_ROLES_LIST:
447
+ execution_role.add_managed_policy(iam.ManagedPolicy.from_aws_managed_policy_name(f"{role}"))
448
+ execution_role = add_custom_policies(self, execution_role, custom_policy_text=custom_sts_kms_policy)
449
+ print("Successfully created new ECS execution role")
450
+
451
+ except Exception as e:
452
+ raise Exception("Failed at IAM role step due to:", e)
453
+
454
+ # --- S3 Buckets ---
455
+ try:
456
+ log_bucket_name = S3_LOG_CONFIG_BUCKET_NAME
457
+ if get_context_bool(f"exists:{log_bucket_name}"):
458
+ bucket = s3.Bucket.from_bucket_name(self, "LogConfigBucket", bucket_name=log_bucket_name)
459
+ print("Using existing S3 bucket", log_bucket_name)
460
+ else:
461
+ bucket = s3.Bucket(self, "LogConfigBucket", bucket_name=log_bucket_name,
462
+ versioned=False, # Set to True if you need versioning
463
+ # IMPORTANT: Set removal_policy to DESTROY
464
+ removal_policy=RemovalPolicy.DESTROY,
465
+ # IMPORTANT: Set auto_delete_objects to True to empty the bucket before deletion
466
+ auto_delete_objects=True
467
+ ) # Explicitly set bucket_name
468
+ print("Created S3 bucket", log_bucket_name)
469
+
470
+ # Add policies - this will apply to both created and imported buckets
471
+ # CDK handles idempotent policy additions
472
+ bucket.add_to_resource_policy(
473
+ iam.PolicyStatement(
474
+ effect=iam.Effect.ALLOW,
475
+ principals=[task_role], # Pass the role object directly
476
+ actions=["s3:GetObject", "s3:PutObject"],
477
+ resources=[f"{bucket.bucket_arn}/*"]
478
+ )
479
+ )
480
+ bucket.add_to_resource_policy(
481
+ iam.PolicyStatement(
482
+ effect=iam.Effect.ALLOW,
483
+ principals=[task_role],
484
+ actions=["s3:ListBucket"],
485
+ resources=[bucket.bucket_arn]
486
+ )
487
+ )
488
+
489
+ output_bucket_name = S3_OUTPUT_BUCKET_NAME
490
+ if get_context_bool(f"exists:{output_bucket_name}"):
491
+ output_bucket = s3.Bucket.from_bucket_name(self, "OutputBucket", bucket_name=output_bucket_name)
492
+ print("Using existing Output bucket", output_bucket_name)
493
+ else:
494
+ output_bucket = s3.Bucket(self, "OutputBucket", bucket_name=output_bucket_name,
495
+ lifecycle_rules=[
496
+ s3.LifecycleRule(
497
+ expiration=Duration.days(int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS))
498
+ )
499
+ ],
500
+ versioned=False, # Set to True if you need versioning
501
+ # IMPORTANT: Set removal_policy to DESTROY
502
+ removal_policy=RemovalPolicy.DESTROY,
503
+ # IMPORTANT: Set auto_delete_objects to True to empty the bucket before deletion
504
+ auto_delete_objects=True
505
+ )
506
+ print("Created Output bucket:", output_bucket_name)
507
+
508
+ # Add policies to output bucket
509
+ output_bucket.add_to_resource_policy(
510
+ iam.PolicyStatement(
511
+ effect=iam.Effect.ALLOW,
512
+ principals=[task_role],
513
+ actions=["s3:GetObject", "s3:PutObject"],
514
+ resources=[f"{output_bucket.bucket_arn}/*"]
515
+ )
516
+ )
517
+ output_bucket.add_to_resource_policy(
518
+ iam.PolicyStatement(
519
+ effect=iam.Effect.ALLOW,
520
+ principals=[task_role],
521
+ actions=["s3:ListBucket"],
522
+ resources=[output_bucket.bucket_arn]
523
+ )
524
+ )
525
+
526
+ except Exception as e:
527
+ raise Exception("Could not handle S3 buckets due to:", e)
528
+
529
+ # --- Elastic Container Registry ---
530
+ try:
531
+ full_ecr_repo_name = ECR_CDK_REPO_NAME
532
+ if get_context_bool(f"exists:{full_ecr_repo_name}"):
533
+ ecr_repo = ecr.Repository.from_repository_name(self, "ECRRepo", repository_name=full_ecr_repo_name)
534
+ print("Using existing ECR repository")
535
+ else:
536
+ ecr_repo = ecr.Repository(self, "ECRRepo", repository_name=full_ecr_repo_name) # Explicitly set repository_name
537
+ print("Created ECR repository", full_ecr_repo_name)
538
+
539
+ ecr_image_loc = ecr_repo.repository_uri
540
+ except Exception as e:
541
+ raise Exception("Could not handle ECR repo due to:", e)
542
+
543
+ # --- CODEBUILD ---
544
+ try:
545
+ codebuild_project_name = CODEBUILD_PROJECT_NAME
546
+ if get_context_bool(f"exists:{codebuild_project_name}"):
547
+ # Lookup CodeBuild project by ARN from context
548
+ project_arn = get_context_str(f"arn:{codebuild_project_name}")
549
+ if not project_arn:
550
+ raise ValueError(f"Context value 'arn:{codebuild_project_name}' is required if project exists.")
551
+ codebuild_project = codebuild.Project.from_project_arn(self, "CodeBuildProject", project_arn=project_arn)
552
+ print("Using existing CodeBuild project")
553
+ else:
554
+ codebuild_project = codebuild.Project(self,
555
+ "CodeBuildProject", # Logical ID
556
+ project_name=codebuild_project_name, # Explicit resource name
557
+ source=codebuild.Source.git_hub(
558
+ owner=GITHUB_REPO_USERNAME,
559
+ repo=GITHUB_REPO_NAME,
560
+ branch_or_ref=GITHUB_REPO_BRANCH
561
+ ),
562
+ environment=codebuild.BuildEnvironment(
563
+ build_image=codebuild.LinuxBuildImage.STANDARD_7_0,
564
+ privileged=True,
565
+ environment_variables={"ECR_REPO_NAME": codebuild.BuildEnvironmentVariable(value=full_ecr_repo_name),
566
+ "AWS_DEFAULT_REGION": codebuild.BuildEnvironmentVariable(value=AWS_REGION),
567
+ "AWS_ACCOUNT_ID": codebuild.BuildEnvironmentVariable(value=AWS_ACCOUNT_ID)}
568
+ ),
569
+ build_spec=codebuild.BuildSpec.from_object({
570
+ "version": "0.2",
571
+ "phases": {
572
+ "pre_build": {
573
+ "commands": [
574
+ "echo Logging in to Amazon ECR",
575
+ "aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
576
+ ]
577
+ },
578
+ "build": {
579
+ "commands": [
580
+ "echo Building the Docker image",
581
+ "docker build -t $ECR_REPO_NAME:latest .",
582
+ "docker tag $ECR_REPO_NAME:latest $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO_NAME:latest"
583
+ ]
584
+ },
585
+ "post_build": {
586
+ "commands": [
587
+ "echo Pushing the Docker image",
588
+ "docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO_NAME:latest"
589
+ ]
590
+ }
591
+ }
592
+ })
593
+ )
594
+ print("Successfully created CodeBuild project", codebuild_project_name)
595
+
596
+ # Grant permissions - applies to both created and imported project role
597
+ ecr_repo.grant_pull_push(codebuild_project.role)
598
+
599
+ except Exception as e:
600
+ raise Exception("Could not handle Codebuild project due to:", e)
601
+
602
+ # --- Security Groups ---
603
+ try:
604
+ ecs_security_group_name = ECS_SECURITY_GROUP_NAME
605
+ # Following checks by name don't really work
606
+ # Use CDK's from_lookup_by_name which handles lookup or throws an error if not found
607
+ #try:
608
+ # ecs_security_group = ec2.SecurityGroup.from_lookup_by_name(
609
+ # self, "ECSSecurityGroup", vpc=vpc, security_group_name=ecs_security_group_name
610
+ # )
611
+ # print(f"Using existing Security Group: {ecs_security_group_name}")
612
+ # except Exception: # If lookup fails, create
613
+ try:
614
+ ecs_security_group = ec2.SecurityGroup(
615
+ self,
616
+ "ECSSecurityGroup", # Logical ID
617
+ security_group_name=ecs_security_group_name, # Explicit resource name
618
+ vpc=vpc,
619
+ )
620
+ print(f"Created Security Group: {ecs_security_group_name}")
621
+ except Exception as e: # If lookup fails, create
622
+ print("Failed to create ECS security group due to:", e)
623
+
624
+ alb_security_group_name = ALB_NAME_SECURITY_GROUP_NAME
625
+ # try:
626
+ # alb_security_group = ec2.SecurityGroup.from_lookup_by_name(
627
+ # self, "ALBSecurityGroup", vpc=vpc, security_group_name=alb_security_group_name
628
+ # )
629
+ # print(f"Using existing Security Group: {alb_security_group_name}")
630
+ # except Exception: # If lookup fails, create
631
+ try:
632
+ alb_security_group = ec2.SecurityGroup(
633
+ self,
634
+ "ALBSecurityGroup", # Logical ID
635
+ security_group_name=alb_security_group_name, # Explicit resource name
636
+ vpc=vpc
637
+ )
638
+ print(f"Created Security Group: {alb_security_group_name}")
639
+ except Exception as e: # If lookup fails, create
640
+ print("Failed to create ALB security group due to:", e)
641
+
642
+ # Define Ingress Rules - CDK will manage adding/removing these as needed
643
+ ec2_port_gradio_server_port = ec2.Port.tcp(int(GRADIO_SERVER_PORT)) # Ensure port is int
644
+ ecs_security_group.add_ingress_rule(
645
+ peer=alb_security_group,
646
+ connection=ec2_port_gradio_server_port,
647
+ description="ALB traffic",
648
+ )
649
+
650
+ alb_security_group.add_ingress_rule(
651
+ peer=ec2.Peer.prefix_list("pl-93a247fa"),
652
+ connection=ec2.Port.all_traffic(),
653
+ description="CloudFront traffic",
654
+ )
655
+
656
+ except Exception as e:
657
+ raise Exception("Could not handle security groups due to:", e)
658
+
659
+
660
+ # --- DynamoDB tables for logs (optional) ---
661
+
662
+ if SAVE_LOGS_TO_DYNAMODB == 'True':
663
+ try:
664
+ print("Creating DynamoDB tables for logs")
665
+
666
+ dynamodb_table_access = dynamodb.Table(self, "RedactionAccessDataTable",
667
+ table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME,
668
+ partition_key=dynamodb.Attribute(
669
+ name="id",
670
+ type=dynamodb.AttributeType.STRING),
671
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
672
+ removal_policy=RemovalPolicy.DESTROY)
673
+
674
+ dynamodb_table_feedback = dynamodb.Table(self, "RedactionFeedbackDataTable",
675
+ table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME,
676
+ partition_key=dynamodb.Attribute(
677
+ name="id",
678
+ type=dynamodb.AttributeType.STRING),
679
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
680
+ removal_policy=RemovalPolicy.DESTROY)
681
+
682
+ dynamodb_table_usage = dynamodb.Table(self, "RedactionUsageDataTable",
683
+ table_name=USAGE_LOG_DYNAMODB_TABLE_NAME,
684
+ partition_key=dynamodb.Attribute(
685
+ name="id",
686
+ type=dynamodb.AttributeType.STRING),
687
+ billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
688
+ removal_policy=RemovalPolicy.DESTROY)
689
+
690
+ except Exception as e:
691
+ raise Exception("Could not create DynamoDB tables due to:", e)
692
+
693
+ # --- ALB ---
694
+ try:
695
+ load_balancer_name = ALB_NAME
696
+ if len(load_balancer_name) > 32: load_balancer_name = load_balancer_name[-32:]
697
+ if get_context_bool(f"exists:{load_balancer_name}"):
698
+ # Lookup ALB by ARN from context
699
+ alb_arn = get_context_str(f"arn:{load_balancer_name}")
700
+ if not alb_arn:
701
+ raise ValueError(f"Context value 'arn:{load_balancer_name}' is required if ALB exists.")
702
+ alb = elbv2.ApplicationLoadBalancer.from_lookup(
703
+ self, "ALB", # Logical ID
704
+ load_balancer_arn=alb_arn
705
+ )
706
+ print(f"Using existing Application Load Balancer {load_balancer_name}.")
707
+ else:
708
+ alb = elbv2.ApplicationLoadBalancer(
709
+ self,
710
+ "ALB", # Logical ID
711
+ load_balancer_name=load_balancer_name, # Explicit resource name
712
+ vpc=vpc,
713
+ internet_facing=True,
714
+ security_group=alb_security_group, # Link to SG
715
+ vpc_subnets=public_subnet_selection # Link to subnets
716
+ )
717
+ print("Successfully created new Application Load Balancer")
718
+ except Exception as e:
719
+ raise Exception("Could not handle application load balancer due to:", e)
720
+
721
+
722
+
723
+ # --- Cognito User Pool ---
724
+ try:
725
+ if get_context_bool(f"exists:{COGNITO_USER_POOL_NAME}"):
726
+ # Lookup by ID from context
727
+ user_pool_id = get_context_str(f"id:{COGNITO_USER_POOL_NAME}")
728
+ if not user_pool_id:
729
+ raise ValueError(f"Context value 'id:{COGNITO_USER_POOL_NAME}' is required if User Pool exists.")
730
+ user_pool = cognito.UserPool.from_user_pool_id(self, "UserPool", user_pool_id=user_pool_id)
731
+ print(f"Using existing user pool {user_pool_id}.")
732
+ else:
733
+ user_pool = cognito.UserPool(self, "UserPool",
734
+ user_pool_name=COGNITO_USER_POOL_NAME,
735
+ mfa=cognito.Mfa.OFF, # Adjust as needed
736
+ sign_in_aliases=cognito.SignInAliases(email=True),
737
+ removal_policy=RemovalPolicy.DESTROY) # Adjust as needed
738
+ print(f"Created new user pool {user_pool.user_pool_id}.")
739
+
740
+ # If you're using a certificate, assume that you will be using the ALB Cognito login features. You need different redirect URLs to accept the token that comes from Cognito authentication.
741
+ if ACM_CERTIFICATE_ARN:
742
+ redirect_uris = [COGNITO_REDIRECTION_URL, COGNITO_REDIRECTION_URL + "/oauth2/idpresponse"]
743
+ else:
744
+ redirect_uris = [COGNITO_REDIRECTION_URL]
745
+
746
+ user_pool_client_name = COGNITO_USER_POOL_CLIENT_NAME
747
+ if get_context_bool(f"exists:{user_pool_client_name}"):
748
+ # Lookup by ID from context (requires User Pool object)
749
+ user_pool_client_id = get_context_str(f"id:{user_pool_client_name}")
750
+ if not user_pool_client_id:
751
+ raise ValueError(f"Context value 'id:{user_pool_client_name}' is required if User Pool Client exists.")
752
+ user_pool_client = cognito.UserPoolClient.from_user_pool_client_id(self, "UserPoolClient", user_pool_client_id=user_pool_client_id)
753
+ print(f"Using existing user pool client {user_pool_client_id}.")
754
+ else:
755
+ user_pool_client = cognito.UserPoolClient(self, "UserPoolClient",
756
+ auth_flows=cognito.AuthFlow(user_srp=True, user_password=True), # Example: enable SRP for secure sign-in
757
+ user_pool=user_pool,
758
+ generate_secret=True,
759
+ user_pool_client_name=user_pool_client_name,
760
+ supported_identity_providers=[cognito.UserPoolClientIdentityProvider.COGNITO],
761
+ o_auth=cognito.OAuthSettings(
762
+ flows=cognito.OAuthFlows(authorization_code_grant=True),
763
+ scopes=[cognito.OAuthScope.OPENID, cognito.OAuthScope.EMAIL, cognito.OAuthScope.PROFILE],
764
+ callback_urls=redirect_uris
765
+ )
766
+ )
767
+
768
+ CfnOutput(self, "CognitoAppClientId", value=user_pool_client.user_pool_client_id)
769
+
770
+ print(f"Created new user pool client {user_pool_client.user_pool_client_id}.")
771
+
772
+ # Add a domain to the User Pool (crucial for ALB integration)
773
+ user_pool_domain = user_pool.add_domain(
774
+ "UserPoolDomain",
775
+ cognito_domain=cognito.CognitoDomainOptions(
776
+ domain_prefix=COGNITO_USER_POOL_DOMAIN_PREFIX)
777
+ )
778
+
779
+ # Apply removal_policy to the created UserPoolDomain construct
780
+ user_pool_domain.apply_removal_policy(policy=RemovalPolicy.DESTROY)
781
+
782
+ CfnOutput(self, "CognitoUserPoolLoginUrl", value=user_pool_domain.base_url())
783
+
784
+ except Exception as e:
785
+ raise Exception("Could not handle Cognito resources due to:", e)
786
+
787
+ # --- Secrets Manager Secret ---
788
+ try:
789
+ secret_name = COGNITO_USER_POOL_CLIENT_SECRET_NAME
790
+ if get_context_bool(f"exists:{secret_name}"):
791
+ # Lookup by name
792
+ secret = secretsmanager.Secret.from_secret_name_v2(self, "CognitoSecret", secret_name=secret_name)
793
+ print(f"Using existing Secret {secret_name}.")
794
+ else:
795
+ secret = secretsmanager.Secret(self, "CognitoSecret", # Logical ID
796
+ secret_name=secret_name, # Explicit resource name
797
+ secret_object_value={
798
+ "REDACTION_USER_POOL_ID": SecretValue.unsafe_plain_text(user_pool.user_pool_id), # Use the CDK attribute
799
+ "REDACTION_CLIENT_ID": SecretValue.unsafe_plain_text(user_pool_client.user_pool_client_id), # Use the CDK attribute
800
+ "REDACTION_CLIENT_SECRET": user_pool_client.user_pool_client_secret # Use the CDK attribute
801
+ }
802
+ )
803
+ print(f"Created new secret {secret_name}.")
804
+
805
+ except Exception as e:
806
+ raise Exception("Could not handle Secrets Manager secret due to:", e)
807
+
808
+ # --- Fargate Task Definition ---
809
+ try:
810
+ # For task definitions, re-creating with the same logical ID creates new revisions.
811
+ # If you want to use a *specific existing revision*, you'd need to look it up by ARN.
812
+ # If you want to update the latest revision, defining it here is the standard.
813
+ # Let's assume we always define it here to get revision management.
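+ # (For reference, a specific existing revision could instead be imported with something like
+ # ecs.TaskDefinition.from_task_definition_arn(self, "ImportedTaskDef", "arn:aws:ecs:<region>:<account>:task-definition/<family>:<revision>"),
+ # where the ARN shown is a placeholder - that route is not taken here.)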
814
+ fargate_task_definition_name = FARGATE_TASK_DEFINITION_NAME
815
+
816
+ read_only_file_system = ECS_READ_ONLY_FILE_SYSTEM == 'True'
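+ # When ECS_READ_ONLY_FILE_SYSTEM is enabled the container root filesystem is read-only, so every path the app writes to (logs, feedback, usage, input, output, config, /tmp, etc.) is mapped onto the task's ephemeral volume via the mount points defined below (when the task definition is built inline).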
817
+
818
+ if os.path.exists(TASK_DEFINITION_FILE_LOCATION):
819
+ with open(TASK_DEFINITION_FILE_LOCATION) as f: # Use correct path
820
+ task_def_params = json.load(f)
821
+ # Need to ensure taskRoleArn and executionRoleArn in JSON are correct ARN strings
822
+ else:
823
+ epheremal_storage_volume_name = "appEphemeralVolume"
824
+
825
+ task_def_params = {}
826
+ task_def_params['taskRoleArn'] = task_role.role_arn # Use CDK role object ARN
827
+ task_def_params['executionRoleArn'] = execution_role.role_arn # Use CDK role object ARN
828
+ task_def_params['memory'] = ECS_TASK_MEMORY_SIZE
829
+ task_def_params['cpu'] = ECS_TASK_CPU_SIZE
830
+ container_def = {
831
+ "name": full_ecr_repo_name,
832
+ "image": ecr_image_loc + ":latest",
833
+ "essential": True,
834
+ "portMappings": [{"containerPort": int(GRADIO_SERVER_PORT), "hostPort": int(GRADIO_SERVER_PORT), "protocol": "tcp", "appProtocol": "http"}],
835
+ "logConfiguration": {"logDriver": "awslogs", "options": {"awslogs-group": ECS_LOG_GROUP_NAME, "awslogs-region": AWS_REGION, "awslogs-stream-prefix": "ecs"}},
836
+ "environmentFiles": [{"value": bucket.bucket_arn + "/config.env", "type": "s3"}],
837
+ "memoryReservation": int(task_def_params['memory']) - 512, # Reserve some memory for the container
838
+ "mountPoints": [
839
+ {
840
+ "sourceVolume": epheremal_storage_volume_name,
841
+ "containerPath": "/home/user/app/logs",
842
+ "readOnly": False
843
+ },
844
+ {
845
+ "sourceVolume": epheremal_storage_volume_name,
846
+ "containerPath": "/home/user/app/feedback",
847
+ "readOnly": False
848
+ },
849
+ {
850
+ "sourceVolume": epheremal_storage_volume_name,
851
+ "containerPath": "/home/user/app/usage",
852
+ "readOnly": False
853
+ },
854
+ {
855
+ "sourceVolume": epheremal_storage_volume_name,
856
+ "containerPath": "/home/user/app/input",
857
+ "readOnly": False
858
+ },
859
+ {
860
+ "sourceVolume": epheremal_storage_volume_name,
861
+ "containerPath": "/home/user/app/output",
862
+ "readOnly": False
863
+ },
864
+ {
865
+ "sourceVolume": epheremal_storage_volume_name,
866
+ "containerPath": "/home/user/app/tmp",
867
+ "readOnly": False
868
+ },
869
+ {
870
+ "sourceVolume": epheremal_storage_volume_name,
871
+ "containerPath": "/home/user/app/config",
872
+ "readOnly": False
873
+ },
874
+ {
875
+ "sourceVolume": epheremal_storage_volume_name,
876
+ "containerPath": "/tmp/matplotlib_cache",
877
+ "readOnly": False
878
+ },
879
+ {
880
+ "sourceVolume": epheremal_storage_volume_name,
881
+ "containerPath": "/tmp",
882
+ "readOnly": False
883
+ },
884
+ {
885
+ "sourceVolume": epheremal_storage_volume_name,
886
+ "containerPath": "/var/tmp",
887
+ "readOnly": False
888
+ },
889
+ {
890
+ "sourceVolume": epheremal_storage_volume_name,
891
+ "containerPath": "/tmp/tld",
892
+ "readOnly": False
893
+ },
894
+ {
895
+ "sourceVolume": epheremal_storage_volume_name,
896
+ "containerPath": "/tmp/gradio_tmp",
897
+ "readOnly": False
898
+ }
899
+ ],
900
+ "readonlyRootFilesystem": read_only_file_system,
901
+ }
902
+ task_def_params['containerDefinitions'] = [container_def]
903
+
904
+
905
+ log_group_name_from_config=task_def_params['containerDefinitions'][0]['logConfiguration']['options']['awslogs-group']
906
+
907
+ cdk_managed_log_group = logs.LogGroup(self, "MyTaskLogGroup", # CDK Logical ID
908
+ log_group_name=log_group_name_from_config,
909
+ retention=logs.RetentionDays.ONE_MONTH, # Example: set retention
910
+ removal_policy=RemovalPolicy.DESTROY # If you want it deleted when stack is deleted
911
+ )
912
+
913
+ epheremal_storage_volume_cdk_obj = ecs.Volume(
914
+ name=epheremal_storage_volume_name
915
+ )
916
+
917
+ fargate_task_definition = ecs.FargateTaskDefinition(
918
+ self,
919
+ "FargateTaskDefinition", # Logical ID
920
+ family=fargate_task_definition_name,
921
+ cpu=int(task_def_params['cpu']),
922
+ memory_limit_mib=int(task_def_params['memory']),
923
+ task_role=task_role,
924
+ execution_role=execution_role,
925
+ runtime_platform=ecs.RuntimePlatform(
926
+ cpu_architecture=ecs.CpuArchitecture.X86_64,
927
+ operating_system_family=ecs.OperatingSystemFamily.LINUX
928
+ ),
929
+ # 1. Specify the total ephemeral storage for the task
930
+ ephemeral_storage_gib=21, # Minimum is 21 GiB
931
+ # 2. Define the volume at the task level
932
+ # This volume will use the ephemeral storage configured above.
933
+ volumes=[epheremal_storage_volume_cdk_obj]
934
+ )
935
+ print("Fargate task definition defined.")
936
+
937
+
938
+
939
+ # Add container definitions to the task definition object
940
+ if task_def_params['containerDefinitions']:
941
+ container_def_params = task_def_params['containerDefinitions'][0]
942
+
943
+ if container_def_params.get('environmentFiles'):
944
+ env_files = []
945
+ for env_file_param in container_def_params['environmentFiles']:
946
+ # Need to parse the ARN to get the bucket object and key
947
+ env_file_arn_parts = env_file_param['value'].split(":::")
948
+ bucket_name_and_key = env_file_arn_parts[-1]
949
+ env_bucket_name, env_key = bucket_name_and_key.split("/", 1)
950
+
951
+ env_file = ecs.EnvironmentFile.from_bucket(bucket, env_key)
952
+
953
+ env_files.append(env_file)
954
+
955
+ container = fargate_task_definition.add_container(
956
+ container_def_params['name'],
957
+ image=ecs.ContainerImage.from_registry(container_def_params['image']),
958
+
959
+ logging=ecs.LogDriver.aws_logs(
960
+ stream_prefix=container_def_params['logConfiguration']['options']['awslogs-stream-prefix'],
961
+ log_group=cdk_managed_log_group
962
+ ),
963
+ secrets={
964
+ "AWS_USER_POOL_ID": ecs.Secret.from_secrets_manager(secret, "REDACTION_USER_POOL_ID"),
965
+ "AWS_CLIENT_ID": ecs.Secret.from_secrets_manager(secret, "REDACTION_CLIENT_ID"),
966
+ "AWS_CLIENT_SECRET": ecs.Secret.from_secrets_manager(secret, "REDACTION_CLIENT_SECRET")
967
+ },
968
+ environment_files=env_files,
969
+ readonly_root_filesystem=read_only_file_system
970
+ )
971
+
972
+ for port_mapping in container_def_params['portMappings']:
973
+ container.add_port_mappings(
974
+ ecs.PortMapping(
975
+ container_port=int(port_mapping['containerPort']),
976
+ host_port=int(port_mapping['hostPort']),
977
+ name="port-" + str(port_mapping['containerPort']),
978
+ app_protocol=ecs.AppProtocol.http,
979
+ protocol=ecs.Protocol.TCP
980
+ )
981
+ )
982
+
983
+ container.add_port_mappings(ecs.PortMapping(
984
+ container_port=80,
985
+ host_port=80,
986
+ name="port-80",
987
+ app_protocol=ecs.AppProtocol.http,
988
+ protocol=ecs.Protocol.TCP
989
+ ))
990
+
991
+ if container_def_params.get('mountPoints'):
992
+ mount_points=[]
993
+ for mount_point in container_def_params['mountPoints']:
994
+ mount_points.append(ecs.MountPoint(container_path=mount_point['containerPath'], read_only=mount_point['readOnly'], source_volume=epheremal_storage_volume_name))
995
+ container.add_mount_points(*mount_points)
996
+
997
+ except Exception as e:
998
+ raise Exception("Could not handle Fargate task definition due to:", e)
999
+
1000
+
1001
+ # --- ECS Cluster ---
1002
+ try:
1003
+ cluster = ecs.Cluster(
1004
+ self,
1005
+ "ECSCluster", # Logical ID
1006
+ cluster_name=CLUSTER_NAME, # Explicit resource name
1007
+ enable_fargate_capacity_providers=True,
1008
+ vpc=vpc
1009
+ )
1010
+ print("Successfully created new ECS cluster")
1011
+ except Exception as e:
1012
+ raise Exception("Could not handle ECS cluster due to:", e)
1013
+
1014
+
1015
+ # --- ECS Service ---
1016
+ try:
1017
+ ecs_service_name = ECS_SERVICE_NAME
1018
+
1019
+ if ECS_USE_FARGATE_SPOT == 'True': use_fargate_spot = "FARGATE_SPOT"
1020
+ else: use_fargate_spot = "FARGATE" # fall back to on-demand Fargate for any value other than 'True', avoiding an unbound variable
1021
+
1022
+ # Check if service exists - from_service_arn or from_service_name (needs cluster)
1023
+ try:
1024
+ # from_service_name is useful if you have the cluster object
1025
+ ecs_service = ecs.FargateService.from_service_attributes(
1026
+ self, "ECSService", # Logical ID
1027
+ cluster=cluster, # Requires the cluster object
1028
+ service_name=ecs_service_name
1029
+ )
1030
+ print(f"Using existing ECS service {ecs_service_name}.")
1031
+ except Exception:
1032
+ # Service will be created with a count of 0, because you haven't yet actually built the initial Docker container with CodeBuild
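+ # (Once the first image has been pushed, the desired count can be raised, e.g. with "aws ecs update-service --cluster <cluster> --service <service> --desired-count 1" - command shown for illustration only.)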
1033
+ ecs_service = ecs.FargateService(
1034
+ self,
1035
+ "ECSService", # Logical ID
1036
+ service_name=ecs_service_name, # Explicit resource name
1037
+ platform_version=ecs.FargatePlatformVersion.LATEST,
1038
+ capacity_provider_strategies=[ecs.CapacityProviderStrategy(capacity_provider=use_fargate_spot, base=0, weight=1)],
1039
+ cluster=cluster,
1040
+ task_definition=fargate_task_definition, # Link to TD
1041
+ security_groups=[ecs_security_group], # Link to SG
1042
+ vpc_subnets=ec2.SubnetSelection(subnets=self.private_subnets), # Link to subnets
1043
+ min_healthy_percent=0,
1044
+ max_healthy_percent=100,
1045
+ desired_count=0
1046
+ )
1047
+ print("Successfully created new ECS service")
1048
+
1049
+ # Note: Auto-scaling setup would typically go here if needed for the service
1050
+
1051
+ except Exception as e:
1052
+ raise Exception("Could not handle ECS service due to:", e)
1053
+
1054
+ # --- Grant Secret Read Access (Applies to both created and imported roles) ---
1055
+ try:
1056
+ secret.grant_read(task_role)
1057
+ secret.grant_read(execution_role)
1058
+ except Exception as e:
1059
+ raise Exception("Could not grant access to Secrets Manager due to:", e)
1060
+
1061
+ # --- ALB TARGET GROUPS AND LISTENERS ---
1062
+ # This section should primarily define the resources if they are managed by this stack.
1063
+ # CDK handles adding/removing targets and actions on updates.
1064
+ # If they might pre-exist outside the stack, you need lookups.
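+ # (For example, a pre-existing target group could be imported with elbv2.ApplicationTargetGroup.from_target_group_attributes(self, "ImportedTG", target_group_arn="<existing-arn>") - not used in this stack.)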
1065
+ cookie_duration = Duration.hours(12)
1066
+ target_group_name = ALB_TARGET_GROUP_NAME # Explicit resource name
1067
+ cloudfront_distribution_url = "cloudfront_placeholder.net" # Need to replace this afterwards with the actual cloudfront_distribution.domain_name
1068
+
1069
+ try:
1070
+ # --- CREATING TARGET GROUPS AND ADDING THE CLOUDFRONT LISTENER RULE ---
1071
+
1072
+ target_group = elbv2.ApplicationTargetGroup(
1073
+ self,
1074
+ "AppTargetGroup", # Logical ID
1075
+ target_group_name=target_group_name, # Explicit resource name
1076
+ port=int(GRADIO_SERVER_PORT), # Ensure port is int
1077
+ protocol=elbv2.ApplicationProtocol.HTTP,
1078
+ targets=[ecs_service], # Link to ECS Service
1079
+ stickiness_cookie_duration=cookie_duration,
1080
+ vpc=vpc, # Target Groups need VPC
1081
+ )
1082
+ print(f"ALB target group {target_group_name} defined.")
1083
+
1084
+ # First HTTP
1085
+ listener_port = 80
1086
+ # Check if Listener exists - from_listener_arn or lookup by port/ALB
1087
+
1088
+ http_listener = alb.add_listener(
1089
+ "HttpListener", # Logical ID
1090
+ port=listener_port,
1091
+ open=False, # Be cautious with open=True, usually restrict source SG
1092
+ )
1093
+ print(f"ALB listener on port {listener_port} defined.")
1094
+
1095
+
1096
+ if ACM_CERTIFICATE_ARN:
1097
+ http_listener.add_action(
1098
+ "DefaultAction", # Logical ID for the default action
1099
+ action=elbv2.ListenerAction.redirect(protocol='HTTPS',
1100
+ host='#{host}',
1101
+ port='443',
1102
+ path='/#{path}',
1103
+ query='#{query}')
1104
+ )
1105
+ else:
1106
+ if USE_CLOUDFRONT == 'True':
1107
+
1108
+ # The following default action can be added for the listener after a host header rule is added to the listener manually in the Console as suggested in the above comments.
1109
+ http_listener.add_action(
1110
+ "DefaultAction", # Logical ID for the default action
1111
+ action=elbv2.ListenerAction.fixed_response(
1112
+ status_code=403,
1113
+ content_type="text/plain",
1114
+ message_body="Access denied",
1115
+ ),
1116
+ )
1117
+
1118
+ # Add the Listener Rule for the specific CloudFront Host Header
1119
+ http_listener.add_action(
1120
+ "CloudFrontHostHeaderRule",
1121
+ action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration),
1122
+ priority=1, # Example priority. Adjust as needed. Lower is evaluated first.
1123
+ conditions=[
1124
+ elbv2.ListenerCondition.host_headers([cloudfront_distribution_url]) # May have to redefine url in console afterwards if not specified in config file
1125
+ ]
1126
+ )
1127
+
1128
+ else:
1129
+ # Add the Listener Rule for the specific CloudFront Host Header
1130
+ http_listener.add_action(
1131
+ "CloudFrontHostHeaderRule",
1132
+ action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration)
1133
+ )
1134
+
1135
+ print("Added targets and actions to ALB HTTP listener.")
1136
+
1137
+ # Now the same for HTTPS if you have an ACM certificate
1138
+ if ACM_CERTIFICATE_ARN:
1139
+ listener_port_https = 443
1140
+ # Check if Listener exists - from_listener_arn or lookup by port/ALB
1141
+
1142
+ https_listener = add_alb_https_listener_with_cert(
1143
+ self,
1144
+ "MyHttpsListener", # Logical ID for the HTTPS listener
1145
+ alb,
1146
+ acm_certificate_arn=ACM_CERTIFICATE_ARN,
1147
+ default_target_group=target_group,
1148
+ enable_cognito_auth=True,
1149
+ cognito_user_pool=user_pool,
1150
+ cognito_user_pool_client=user_pool_client,
1151
+ cognito_user_pool_domain=user_pool_domain,
1152
+ listener_open_to_internet=True,
1153
+ stickiness_cookie_duration=cookie_duration
1154
+ )
1155
+
1156
+ if https_listener:
1157
+ CfnOutput(self, "HttpsListenerArn", value=https_listener.listener_arn)
1158
+
1159
+ print(f"ALB listener on port {listener_port_https} defined.")
1160
+
1161
+ # if USE_CLOUDFRONT == 'True':
1162
+ # # Add default action to the listener
1163
+ # https_listener.add_action(
1164
+ # "DefaultAction", # Logical ID for the default action
1165
+ # action=elbv2.ListenerAction.fixed_response(
1166
+ # status_code=403,
1167
+ # content_type="text/plain",
1168
+ # message_body="Access denied",
1169
+ # ),
1170
+ # )
1171
+
1172
+ # # Add the Listener Rule for the specific CloudFront Host Header
1173
+ # https_listener.add_action(
1174
+ # "CloudFrontHostHeaderRuleHTTPS",
1175
+ # action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration),
1176
+ # priority=1, # Example priority. Adjust as needed. Lower is evaluated first.
1177
+ # conditions=[
1178
+ # elbv2.ListenerCondition.host_headers([cloudfront_distribution_url])
1179
+ # ]
1180
+ # )
1181
+ # else:
1182
+ # https_listener.add_action(
1183
+ # "CloudFrontHostHeaderRuleHTTPS",
1184
+ # action=elbv2.ListenerAction.forward(target_groups=[target_group],stickiness_duration=cookie_duration))
1185
+
1186
+ print("Added targets and actions to ALB HTTPS listener.")
1187
+
1188
+ except Exception as e:
1189
+ raise Exception("Could not handle ALB target groups and listeners due to:", e)
1190
+
1191
+ # Create WAF to attach to load balancer
1192
+ try:
1193
+ web_acl_name = LOAD_BALANCER_WEB_ACL_NAME
1194
+ if get_context_bool(f"exists:{web_acl_name}"):
1195
+ # Lookup WAF ACL by ARN from context
1196
+ web_acl_arn = get_context_str(f"arn:{web_acl_name}")
1197
+ if not web_acl_arn:
1198
+ raise ValueError(f"Context value 'arn:{web_acl_name}' is required if Web ACL exists.")
1199
+
1200
+ web_acl = create_web_acl_with_common_rules(self, web_acl_name, waf_scope="REGIONAL") # Assuming it takes scope and name
1201
+ print(f"Handled ALB WAF web ACL {web_acl_name}.")
1202
+ else:
1203
+ web_acl = create_web_acl_with_common_rules(self, web_acl_name, waf_scope="REGIONAL") # Assuming it takes scope and name
1204
+ print(f"Created ALB WAF web ACL {web_acl_name}.")
1205
+
1206
+ alb_waf_association = wafv2.CfnWebACLAssociation(self, id="alb_waf_association", resource_arn=alb.load_balancer_arn, web_acl_arn=web_acl.attr_arn)
1207
+
1208
+ except Exception as e:
1209
+ raise Exception("Could not handle create ALB WAF web ACL due to:", e)
1210
+
1211
+ # --- Outputs for other stacks/regions ---
1212
+
1213
+ self.params = dict()
1214
+ self.params["alb_arn_output"] = alb.load_balancer_arn
1215
+ self.params["alb_security_group_id"] = alb_security_group.security_group_id
1216
+ self.params["alb_dns_name"] = alb.load_balancer_dns_name
1217
+
1218
+ CfnOutput(self, "AlbArnOutput",
1219
+ value=alb.load_balancer_arn,
1220
+ description="ARN of the Application Load Balancer",
1221
+ export_name=f"{self.stack_name}-AlbArn") # Export name must be unique within the account/region
1222
+
1223
+ CfnOutput(self, "AlbSecurityGroupIdOutput",
1224
+ value=alb_security_group.security_group_id,
1225
+ description="ID of the ALB's Security Group",
1226
+ export_name=f"{self.stack_name}-AlbSgId")
1227
+ CfnOutput(self, "ALBName", value=alb.load_balancer_name)
1228
+
1229
+
1230
+ CfnOutput(self, "RegionalAlbDnsName", value=alb.load_balancer_dns_name)
1231
+
1232
+ CfnOutput(self, "CognitoPoolId", value=user_pool.user_pool_id)
1233
+ # Add other outputs if needed
1234
+
1235
+ CfnOutput(self, "ECRRepoUri", value=ecr_repo.repository_uri)
1236
+
1237
+ # --- CLOUDFRONT DISTRIBUTION in separate stack (us-east-1 required) ---
1238
+ class CdkStackCloudfront(Stack):
1239
+
1240
+ def __init__(self, scope: Construct, construct_id: str, alb_arn: str, alb_sec_group_id:str, alb_dns_name:str, **kwargs) -> None:
1241
+ super().__init__(scope, construct_id, **kwargs)
1242
+
1243
+ # --- Helper to get context values ---
1244
+ def get_context_bool(key: str, default: bool = False) -> bool:
1245
+ return self.node.try_get_context(key) or default
1246
+
1247
+ def get_context_str(key: str, default: str = None) -> str:
1248
+ return self.node.try_get_context(key) or default
1249
+
1250
+ def get_context_dict(scope: Construct, key: str, default: dict = None) -> dict:
1251
+ return scope.node.try_get_context(key) or default
1252
+
1253
+ print(f"CloudFront Stack: Received ALB ARN: {alb_arn}")
1254
+ print(f"CloudFront Stack: Received ALB Security Group ID: {alb_sec_group_id}")
1255
+
1256
+ if not alb_arn:
1257
+ raise ValueError("ALB ARN must be provided to CloudFront stack")
1258
+ if not alb_sec_group_id:
1259
+ raise ValueError("ALB Security Group ID must be provided to CloudFront stack")
1260
+
1261
+ # 2. Import the ALB using its ARN
1262
+ # This imports an existing ALB as a construct in the CloudFront stack's context.
1263
+ # CloudFormation will understand this reference at deploy time.
1264
+ alb = elbv2.ApplicationLoadBalancer.from_application_load_balancer_attributes(
1265
+ self, "ImportedAlb", load_balancer_arn=alb_arn, security_group_id=alb_sec_group_id, load_balancer_dns_name=alb_dns_name
1266
+ )
1267
+
1268
+ try:
1269
+ web_acl_name = WEB_ACL_NAME
1270
+ if get_context_bool(f"exists:{web_acl_name}"):
1271
+ # Lookup WAF ACL by ARN from context
1272
+ web_acl_arn = get_context_str(f"arn:{web_acl_name}")
1273
+ if not web_acl_arn:
1274
+ raise ValueError(f"Context value 'arn:{web_acl_name}' is required if Web ACL exists.")
1275
+
1276
+ web_acl = create_web_acl_with_common_rules(self, web_acl_name) # Assuming it takes scope and name
1277
+ print(f"Handled Cloudfront WAF web ACL {web_acl_name}.")
1278
+ else:
1279
+ web_acl = create_web_acl_with_common_rules(self, web_acl_name) # Assuming it takes scope and name
1280
+ print(f"Created Cloudfront WAF web ACL {web_acl_name}.")
1281
+
1282
+
1283
+ # Add ALB as CloudFront Origin
1284
+ origin = origins.LoadBalancerV2Origin(
1285
+ alb, # Use the created or looked-up ALB object
1286
+ custom_headers={CUSTOM_HEADER: CUSTOM_HEADER_VALUE},
1287
+ origin_shield_enabled=False,
1288
+ protocol_policy=cloudfront.OriginProtocolPolicy.HTTP_ONLY,
1289
+ )
1290
+
1291
+ if CLOUDFRONT_GEO_RESTRICTION: geo_restrict = cloudfront.GeoRestriction.allowlist(CLOUDFRONT_GEO_RESTRICTION)
1292
+ else: geo_restrict = None
1293
+
1294
+ cloudfront_distribution = cloudfront.Distribution(
1295
+ self,
1296
+ "CloudFrontDistribution", # Logical ID
1297
+ comment=CLOUDFRONT_DISTRIBUTION_NAME, # Use name as comment for easier identification
1298
+ geo_restriction=geo_restrict,
1299
+ default_behavior=cloudfront.BehaviorOptions(
1300
+ origin=origin,
1301
+ viewer_protocol_policy=cloudfront.ViewerProtocolPolicy.REDIRECT_TO_HTTPS,
1302
+ allowed_methods=cloudfront.AllowedMethods.ALLOW_ALL,
1303
+ cache_policy=cloudfront.CachePolicy.CACHING_DISABLED,
1304
+ origin_request_policy=cloudfront.OriginRequestPolicy.ALL_VIEWER,
1305
+ ),
1306
+ web_acl_id=web_acl.attr_arn
1307
+ )
1308
+ print(f"Cloudfront distribution {CLOUDFRONT_DISTRIBUTION_NAME} defined.")
1309
+
1310
+ except Exception as e:
1311
+ raise Exception("Could not handle Cloudfront distribution due to:", e)
1312
+
1313
+
1314
+ # --- Outputs ---
1315
+ CfnOutput(self, "CloudFrontDistributionURL",
1316
+ value=cloudfront_distribution.domain_name)
1317
+
cdk/check_resources.py ADDED
@@ -0,0 +1,297 @@
1
+ import json
2
+ import os
3
+ from cdk_config import CDK_PREFIX, VPC_NAME, AWS_REGION, PUBLIC_SUBNETS_TO_USE, PRIVATE_SUBNETS_TO_USE, CODEBUILD_ROLE_NAME, ECS_TASK_ROLE_NAME, ECS_TASK_EXECUTION_ROLE_NAME, S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME, ECR_CDK_REPO_NAME, CODEBUILD_PROJECT_NAME, ALB_NAME, COGNITO_USER_POOL_NAME, COGNITO_USER_POOL_CLIENT_NAME, COGNITO_USER_POOL_CLIENT_SECRET_NAME, WEB_ACL_NAME, CONTEXT_FILE, PUBLIC_SUBNET_CIDR_BLOCKS, PRIVATE_SUBNET_CIDR_BLOCKS, PUBLIC_SUBNET_AVAILABILITY_ZONES, PRIVATE_SUBNET_AVAILABILITY_ZONES, CDK_FOLDER, CDK_CONFIG_PATH # Import necessary config
4
+ from cdk_functions import ( # Import your check functions (assuming they use Boto3)
5
+ get_vpc_id_by_name,
6
+ check_subnet_exists_by_name,
7
+ check_for_existing_role,
8
+ check_s3_bucket_exists,
9
+ check_ecr_repo_exists,
10
+ check_codebuild_project_exists,
11
+ check_alb_exists,
12
+ check_for_existing_user_pool,
13
+ check_for_existing_user_pool_client,
14
+ check_for_secret,
15
+ check_cloudfront_distribution_exists,
16
+ check_web_acl_exists,
17
+ _get_existing_subnets_in_vpc,
18
+ validate_subnet_creation_parameters
19
+ # Add other check functions as needed
20
+ )
21
+
22
+ from typing import List, Dict, Any
23
+
24
+ cdk_folder = CDK_FOLDER #<FULL_PATH_TO_CDK_FOLDER_HERE>
25
+
26
+ # Full path needed to find config file
27
+ os.environ["CDK_CONFIG_PATH"] = cdk_folder + CDK_CONFIG_PATH
28
+
29
+ # --- Helper to parse environment variables into lists ---
30
+ def _get_env_list(env_var_name: str) -> List[str]:
31
+ """Parses a comma-separated environment variable into a list of strings."""
32
+ value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
33
+ if not value:
34
+ return []
35
+ # Split by comma and filter out any empty strings that might result from extra commas
36
+ return [s.strip() for s in value.split(',') if s.strip()]
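+ # e.g. a raw value of "['public-subnet-1', 'public-subnet-2']" becomes ['public-subnet-1', 'public-subnet-2'] (subnet names here are illustrative)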
37
+
38
+
39
+ if PUBLIC_SUBNETS_TO_USE and not isinstance(PUBLIC_SUBNETS_TO_USE, list): PUBLIC_SUBNETS_TO_USE = _get_env_list(PUBLIC_SUBNETS_TO_USE)
40
+ if PRIVATE_SUBNETS_TO_USE and not isinstance(PRIVATE_SUBNETS_TO_USE, list): PRIVATE_SUBNETS_TO_USE = _get_env_list(PRIVATE_SUBNETS_TO_USE)
41
+ if PUBLIC_SUBNET_CIDR_BLOCKS and not isinstance(PUBLIC_SUBNET_CIDR_BLOCKS, list): PUBLIC_SUBNET_CIDR_BLOCKS = _get_env_list(PUBLIC_SUBNET_CIDR_BLOCKS)
42
+ if PUBLIC_SUBNET_AVAILABILITY_ZONES and not isinstance(PUBLIC_SUBNET_AVAILABILITY_ZONES, list): PUBLIC_SUBNET_AVAILABILITY_ZONES = _get_env_list(PUBLIC_SUBNET_AVAILABILITY_ZONES)
43
+ if PRIVATE_SUBNET_CIDR_BLOCKS and not isinstance(PRIVATE_SUBNET_CIDR_BLOCKS, list): PRIVATE_SUBNET_CIDR_BLOCKS = _get_env_list(PRIVATE_SUBNET_CIDR_BLOCKS)
44
+ if PRIVATE_SUBNET_AVAILABILITY_ZONES and not isinstance(PRIVATE_SUBNET_AVAILABILITY_ZONES, list): PRIVATE_SUBNET_AVAILABILITY_ZONES = _get_env_list(PRIVATE_SUBNET_AVAILABILITY_ZONES)
45
+
46
+ # Check for the existence of elements in your AWS environment to see if it's necessary to create new versions of the same
47
+
48
+ def check_and_set_context():
49
+ context_data = {}
50
+
51
+ # --- Find the VPC ID first ---
52
+ print("VPC_NAME:", VPC_NAME)
53
+ vpc_id, nat_gateways = get_vpc_id_by_name(VPC_NAME)
54
+
55
+ # If you expect only one, or one per AZ and you're creating one per AZ in CDK:
56
+ if nat_gateways:
57
+ # For simplicity, let's just check if *any* NAT exists in the VPC
58
+ # A more robust check would match by subnet, AZ, or a specific tag.
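+ # e.g. (illustrative, assuming the returned dicts carry a boto3-style 'Tags' list): [gw for gw in nat_gateways if any(t.get('Key') == 'Name' and t.get('Value') == '<expected-name>' for t in gw.get('Tags', []))]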
59
+ context_data["exists:NatGateway"] = True
60
+ context_data["id:NatGateway"] = nat_gateways[0]['NatGatewayId'] # Store the ID of the first one found
61
+ else:
62
+ context_data["exists:NatGateway"] = False
63
+ context_data["id:NatGateway"] = None
64
+
65
+ if not vpc_id:
66
+ # If the VPC doesn't exist, you might not be able to check/create subnets.
67
+ # Decide how to handle this: raise an error, set a flag, etc.
68
+ raise RuntimeError(f"Required VPC '{VPC_NAME}' not found. Cannot proceed with subnet checks.")
69
+
70
+ context_data["vpc_id"] = vpc_id # Store VPC ID in context
71
+
72
+ # SUBNET CHECKS
73
+ # (context_data already holds the VPC ID and NAT Gateway details gathered above, so it is deliberately not re-initialised here)
74
+ all_proposed_subnets_data: List[Dict[str, str]] = []
75
+
76
+ # Flag to indicate if full validation mode (with CIDR/AZs) is active
77
+ full_validation_mode = False
78
+
79
+ # Determine if full validation mode is possible/desired
80
+ # It's 'desired' if CIDR/AZs are provided, and their lengths match the name lists.
81
+ public_ready_for_full_validation = (
82
+ len(PUBLIC_SUBNETS_TO_USE) > 0 and
83
+ len(PUBLIC_SUBNET_CIDR_BLOCKS) == len(PUBLIC_SUBNETS_TO_USE) and
84
+ len(PUBLIC_SUBNET_AVAILABILITY_ZONES) == len(PUBLIC_SUBNETS_TO_USE)
85
+ )
86
+ private_ready_for_full_validation = (
87
+ len(PRIVATE_SUBNETS_TO_USE) > 0 and
88
+ len(PRIVATE_SUBNET_CIDR_BLOCKS) == len(PRIVATE_SUBNETS_TO_USE) and
89
+ len(PRIVATE_SUBNET_AVAILABILITY_ZONES) == len(PRIVATE_SUBNETS_TO_USE)
90
+ )
91
+
92
+ # Activate full validation if *any* type of subnet (public or private) has its full details provided.
93
+ # You might adjust this logic if you require ALL subnet types to have CIDRs, or NONE.
94
+ if public_ready_for_full_validation or private_ready_for_full_validation:
95
+ full_validation_mode = True
96
+
97
+ # If some are ready but others aren't, print a warning or raise an error based on your strictness
98
+ if public_ready_for_full_validation and not private_ready_for_full_validation and PRIVATE_SUBNETS_TO_USE:
99
+ print("Warning: Public subnets have CIDRs/AZs, but private subnets do not. Only public will be fully validated/created with CIDRs.")
100
+ if private_ready_for_full_validation and not public_ready_for_full_validation and PUBLIC_SUBNETS_TO_USE:
101
+ print("Warning: Private subnets have CIDRs/AZs, but public subnets do not. Only private will be fully validated/created with CIDRs.")
102
+
103
+ # Prepare data for validate_subnet_creation_parameters for all subnets that have full details
104
+ if public_ready_for_full_validation:
105
+ for i, name in enumerate(PUBLIC_SUBNETS_TO_USE):
106
+ all_proposed_subnets_data.append({
107
+ 'name': name,
108
+ 'cidr': PUBLIC_SUBNET_CIDR_BLOCKS[i],
109
+ 'az': PUBLIC_SUBNET_AVAILABILITY_ZONES[i]
110
+ })
111
+ if private_ready_for_full_validation:
112
+ for i, name in enumerate(PRIVATE_SUBNETS_TO_USE):
113
+ all_proposed_subnets_data.append({
114
+ 'name': name,
115
+ 'cidr': PRIVATE_SUBNET_CIDR_BLOCKS[i],
116
+ 'az': PRIVATE_SUBNET_AVAILABILITY_ZONES[i]
117
+ })
118
+
119
+
120
+ print(f"Target VPC ID for Boto3 lookup: {vpc_id}")
121
+
122
+ # Fetch all existing subnets in the target VPC once to avoid repeated API calls
123
+ try:
124
+ existing_aws_subnets = _get_existing_subnets_in_vpc(vpc_id)
125
+ except Exception as e:
126
+ print(f"Failed to fetch existing VPC subnets. Aborting. Error: {e}")
127
+ raise SystemExit(1) # Exit immediately if we can't get baseline data
128
+
129
+ print("\n--- Running Name-Only Subnet Existence Check Mode ---")
130
+ # Fallback: check only by name using the existing data
131
+ checked_public_subnets = {}
132
+ if PUBLIC_SUBNETS_TO_USE:
133
+ for subnet_name in PUBLIC_SUBNETS_TO_USE:
134
+ print("subnet_name:", subnet_name)
135
+ exists, subnet_id = check_subnet_exists_by_name(subnet_name, existing_aws_subnets)
136
+ checked_public_subnets[subnet_name] = {"exists": exists, "id": subnet_id}
137
+
138
+ # If the subnet exists, remove it from the proposed subnets list
139
+ if checked_public_subnets[subnet_name]["exists"] == True:
140
+ all_proposed_subnets_data = [
141
+ subnet for subnet in all_proposed_subnets_data
142
+ if subnet['name'] != subnet_name
143
+ ]
144
+
145
+ context_data["checked_public_subnets"] = checked_public_subnets
146
+
147
+ checked_private_subnets = {}
148
+ if PRIVATE_SUBNETS_TO_USE:
149
+ for subnet_name in PRIVATE_SUBNETS_TO_USE:
150
+ print("subnet_name:", subnet_name)
151
+ exists, subnet_id = check_subnet_exists_by_name(subnet_name, existing_aws_subnets)
152
+ checked_private_subnets[subnet_name] = {"exists": exists, "id": subnet_id}
153
+
154
+ # If the subnet exists, remove it from the proposed subnets list
155
+ if checked_private_subnets[subnet_name]["exists"] == True:
156
+ all_proposed_subnets_data = [
157
+ subnet for subnet in all_proposed_subnets_data
158
+ if subnet['name'] != subnet_name
159
+ ]
160
+
161
+ context_data["checked_private_subnets"] = checked_private_subnets
162
+
163
+
164
+
165
+ print("\nName-only existence subnet check complete.\n")
166
+
167
+ if full_validation_mode:
168
+ print("\n--- Running in Full Subnet Validation Mode (CIDR/AZs provided) ---")
169
+ try:
170
+ validate_subnet_creation_parameters(vpc_id, all_proposed_subnets_data, existing_aws_subnets)
171
+ print("\nPre-synth validation successful. Proceeding with CDK synth.\n")
172
+
173
+ # Populate context_data for downstream CDK construct creation
174
+ context_data["public_subnets_to_create"] = []
175
+ if public_ready_for_full_validation:
176
+ for i, name in enumerate(PUBLIC_SUBNETS_TO_USE):
177
+ context_data["public_subnets_to_create"].append({
178
+ 'name': name,
179
+ 'cidr': PUBLIC_SUBNET_CIDR_BLOCKS[i],
180
+ 'az': PUBLIC_SUBNET_AVAILABILITY_ZONES[i],
181
+ 'is_public': True
182
+ })
183
+ context_data["private_subnets_to_create"] = []
184
+ if private_ready_for_full_validation:
185
+ for i, name in enumerate(PRIVATE_SUBNETS_TO_USE):
186
+ context_data["private_subnets_to_create"].append({
187
+ 'name': name,
188
+ 'cidr': PRIVATE_SUBNET_CIDR_BLOCKS[i],
189
+ 'az': PRIVATE_SUBNET_AVAILABILITY_ZONES[i],
190
+ 'is_public': False
191
+ })
192
+
193
+ except Exception as e:
194
+ print(f"\nFATAL ERROR: Subnet parameter validation failed: {e}\n")
195
+ raise SystemExit(1) # Exit if validation fails
196
+
197
+ # Example checks and setting context values
198
+ # IAM Roles
199
+ role_name = CODEBUILD_ROLE_NAME
200
+ exists, _, _ = check_for_existing_role(role_name)
201
+ context_data[f"exists:{role_name}"] = exists # Use boolean
202
+ if exists:
203
+ _, role_arn, _ = check_for_existing_role(role_name) # Get ARN if needed
204
+ context_data[f"arn:{role_name}"] = role_arn
205
+
206
+ role_name = ECS_TASK_ROLE_NAME
207
+ exists, _, _ = check_for_existing_role(role_name)
208
+ context_data[f"exists:{role_name}"] = exists
209
+ if exists:
210
+ _, role_arn, _ = check_for_existing_role(role_name)
211
+ context_data[f"arn:{role_name}"] = role_arn
212
+
213
+ role_name = ECS_TASK_EXECUTION_ROLE_NAME
214
+ exists, _, _ = check_for_existing_role(role_name)
215
+ context_data[f"exists:{role_name}"] = exists
216
+ if exists:
217
+ _, role_arn, _ = check_for_existing_role(role_name)
218
+ context_data[f"arn:{role_name}"] = role_arn
219
+
220
+ # S3 Buckets
221
+ bucket_name = S3_LOG_CONFIG_BUCKET_NAME
222
+ exists, _ = check_s3_bucket_exists(bucket_name)
223
+ context_data[f"exists:{bucket_name}"] = exists
224
+ if exists:
225
+ # You might not need the ARN if using from_bucket_name
226
+ pass
227
+
228
+ output_bucket_name = S3_OUTPUT_BUCKET_NAME
229
+ exists, _ = check_s3_bucket_exists(output_bucket_name)
230
+ context_data[f"exists:{output_bucket_name}"] = exists
231
+ if exists:
232
+ pass
233
+
234
+ # ECR Repository
235
+ repo_name = ECR_CDK_REPO_NAME
236
+ exists, _ = check_ecr_repo_exists(repo_name)
237
+ context_data[f"exists:{repo_name}"] = exists
238
+ if exists:
239
+ pass # from_repository_name is sufficient
240
+
241
+ # CodeBuild Project
242
+ project_name = CODEBUILD_PROJECT_NAME
243
+ exists, _ = check_codebuild_project_exists(project_name)
244
+ context_data[f"exists:{project_name}"] = exists
245
+ if exists:
246
+ # Need a way to get the ARN from the check function
247
+ _, project_arn = check_codebuild_project_exists(project_name) # Assuming it returns ARN
248
+ context_data[f"arn:{project_name}"] = project_arn
249
+
250
+ # ALB (by name lookup)
251
+ alb_name = ALB_NAME
252
+ exists, _ = check_alb_exists(alb_name, region_name=AWS_REGION)
253
+ context_data[f"exists:{alb_name}"] = exists
254
+ if exists:
255
+ _, alb_object = check_alb_exists(alb_name, region_name=AWS_REGION) # Assuming check returns object
256
+ print("alb_object:", alb_object)
257
+ context_data[f"arn:{alb_name}"] = alb_object['LoadBalancerArn']
258
+
259
+
260
+ # Cognito User Pool (by name)
261
+ user_pool_name = COGNITO_USER_POOL_NAME
262
+ exists, user_pool_id, _ = check_for_existing_user_pool(user_pool_name)
263
+ context_data[f"exists:{user_pool_name}"] = exists
264
+ if exists:
265
+ context_data[f"id:{user_pool_name}"] = user_pool_id
266
+
267
+ # Cognito User Pool Client (by name and pool ID) - requires User Pool ID from check
268
+ if user_pool_id:
269
+ user_pool_id_for_client_check = user_pool_id #context_data.get(f"id:{user_pool_name}") # Use ID from context
270
+ user_pool_client_name = COGNITO_USER_POOL_CLIENT_NAME
271
+ if user_pool_id_for_client_check:
272
+ exists, client_id, _ = check_for_existing_user_pool_client(user_pool_client_name, user_pool_id_for_client_check)
273
+ context_data[f"exists:{user_pool_client_name}"] = exists
274
+ if exists:
275
+ context_data[f"id:{user_pool_client_name}"] = client_id
276
+
277
+ # Secrets Manager Secret (by name)
278
+ secret_name = COGNITO_USER_POOL_CLIENT_SECRET_NAME
279
+ exists, _ = check_for_secret(secret_name)
280
+ context_data[f"exists:{secret_name}"] = exists
281
+ # You might not need the ARN if using from_secret_name_v2
282
+
283
+
284
+ # WAF Web ACL (by name and scope)
285
+ web_acl_name = WEB_ACL_NAME
286
+ exists, _ = check_web_acl_exists(web_acl_name, scope="CLOUDFRONT") # Assuming check returns object
287
+ context_data[f"exists:{web_acl_name}"] = exists
288
+ if exists:
289
+ _, existing_web_acl = check_web_acl_exists(web_acl_name, scope="CLOUDFRONT")
290
+ context_data[f"arn:{web_acl_name}"] = existing_web_acl.attr_arn
291
+
292
+ # Write the context data to the file
293
+ with open(CONTEXT_FILE, "w") as f:
294
+ json.dump(context_data, f, indent=2)
295
+
296
+ print(f"Context data written to {CONTEXT_FILE}")
297
+
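For context, a hypothetical sketch (not the repository's actual stack code, which is not shown in this diff) of how a CDK stack could consume the context file written above to import an existing resource rather than create a new one; the resource names and construct IDs are assumptions:

```python
# Hypothetical sketch - resource names and construct IDs are assumptions.
import json

import aws_cdk as cdk
from aws_cdk import aws_iam as iam
from constructs import Construct

CONTEXT_FILE = "cdk.context.json"  # assumed to match the script above


class ExampleStack(cdk.Stack):
    def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        with open(CONTEXT_FILE) as f:
            ctx = json.load(f)

        role_name = "example-codebuild-role"  # placeholder for CODEBUILD_ROLE_NAME
        if ctx.get(f"exists:{role_name}"):
            # Reuse the role found by the pre-synth checks
            iam.Role.from_role_arn(self, "CodeBuildRole", ctx[f"arn:{role_name}"])
        else:
            iam.Role(
                self,
                "CodeBuildRole",
                assumed_by=iam.ServicePrincipal("codebuild.amazonaws.com"),
                role_name=role_name,
            )


app = cdk.App()
ExampleStack(app, "ExampleStack")
app.synth()
```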
cdk/post_cdk_build_quickstart.py ADDED
@@ -0,0 +1,27 @@
1
+ import time
2
+ from cdk_config import CODEBUILD_PROJECT_NAME, S3_LOG_CONFIG_BUCKET_NAME, CLUSTER_NAME, ECS_SERVICE_NAME
3
+ from cdk_functions import start_codebuild_build, upload_file_to_s3, start_ecs_task, create_basic_config_env
4
+ from tqdm import tqdm
5
+
6
+ # Create basic config.env file that user can use to run the app later. Input is the folder it is saved into.
7
+ create_basic_config_env("config")
8
+
9
+ # Start codebuild build
10
+ print("Starting CodeBuild project.")
11
+ start_codebuild_build(PROJECT_NAME=CODEBUILD_PROJECT_NAME)
12
+
13
+ # Upload config.env file to S3 bucket
14
+ upload_file_to_s3(local_file_paths="config/config.env", s3_key="", s3_bucket=S3_LOG_CONFIG_BUCKET_NAME)
15
+
16
+ total_seconds = 480 # 8 minutes * 60 seconds/minute
17
+ update_interval = 1 # Update every second
18
+
19
+ print("Waiting eight minutes for the CodeBuild container to build.")
20
+
21
+ # tqdm iterates over a range, and you perform a small sleep in each iteration
22
+ for i in tqdm(range(total_seconds), desc="Building container"):
23
+ time.sleep(update_interval)
24
+
25
+ # Start task on ECS
26
+ print("Starting ECS task")
27
+ start_ecs_task(cluster_name=CLUSTER_NAME, service_name=ECS_SERVICE_NAME)
cdk/requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ aws-cdk-lib==2.200.2
2
+ boto3==1.38.35
3
+ pandas==2.2.3
4
+ nodejs==0.1.1
5
+ python-dotenv==1.0.1
index.qmd ADDED
@@ -0,0 +1,23 @@
1
+ ---
2
+ title: "Home"
3
+ ---
4
+
5
+ version: 0.7.0
6
+
7
+ Welcome to the Document Redaction App documentation. This site provides comprehensive documentation for the Document Redaction App.
8
+
9
+ Navigate through the sections to learn how to install, use, and manage the application. Below is a brief introduction to the app.
10
+
11
+ ## Document redaction
12
+
13
+ Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](src/user_guide.qmd) for a walkthrough on how to use the app.
14
+
15
+ ![Handwriting and signatures redacted example](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/refs/heads/main/review_redactions/Signatures%20and%20handwriting%20found.PNG)
16
+
17
+ To identify text in documents, the app provides several options. 'Local' text/OCR image analysis uses spaCy/Tesseract, and works quite well for documents with typed text. If available, choose the 'AWS Textract service' to handle more complex elements, e.g. signatures or handwriting. The app then identifies personal information for redaction. The 'Local' PII detection option is based on spaCy; it is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
18
+
19
+ After redaction, suggested redactions can be reviewed and modified on the 'Review redactions' tab. The original PDF can be uploaded here alongside a '...redaction_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the types of information to redact (e.g. people, places), or custom terms to always include in or exclude from redaction.
20
+
21
+ NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
22
+
23
+
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
  name = "doc_redaction"
7
- version = "0.6.8"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
@@ -23,8 +23,8 @@ dependencies = [
23
  "spacy==3.8.4",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
- "gradio==5.29.1",
27
- "boto3==1.38.4",
28
  "pyarrow==19.0.1",
29
  "openpyxl==3.1.5",
30
  "Faker==36.1.1",
@@ -39,7 +39,7 @@ dependencies = [
39
  ]
40
 
41
  [project.urls]
42
- Homepage = "https://seanpedrick-case.github.io/doc_redaction/README.html"
43
  repository = "https://github.com/seanpedrick-case/doc_redaction"
44
 
45
  [project.optional-dependencies]
 
4
 
5
  [project]
6
  name = "doc_redaction"
7
+ version = "0.7.0"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
 
23
  "spacy==3.8.4",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
+ "gradio==5.34.0",
27
+ "boto3==1.38.35",
28
  "pyarrow==19.0.1",
29
  "openpyxl==3.1.5",
30
  "Faker==36.1.1",
 
39
  ]
40
 
41
  [project.urls]
42
+ Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
43
  repository = "https://github.com/seanpedrick-case/doc_redaction"
44
 
45
  [project.optional-dependencies]
requirements.txt CHANGED
@@ -10,8 +10,8 @@ pandas==2.2.3
10
  scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
- gradio==5.29.1
14
- boto3==1.38.4
15
  pyarrow==19.0.1
16
  openpyxl==3.1.5
17
  Faker==36.1.1
 
10
  scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
+ gradio==5.34.0
14
+ boto3==1.38.35
15
  pyarrow==19.0.1
16
  openpyxl==3.1.5
17
  Faker==36.1.1
src/app_settings.qmd ADDED
@@ -0,0 +1,481 @@
1
+ ---
2
+ title: "App settings management guide"
3
+ format:
4
+ html:
5
+ toc: true # Enable the table of contents
6
+ toc-depth: 3 # Include headings up to level 2 (##)
7
+ toc-title: "On this page" # Optional: Title for your TOC
8
+ ---
9
+
10
+ Settings for the redaction app can be set from outside by changing values in the `config.env` file stored in your local config folder, or in S3 if running on AWS. This guide provides an overview of how to configure the application using environment variables. The application loads configurations using `os.environ.get()`. It first attempts to load variables from the file specified by `APP_CONFIG_PATH` (which defaults to `config/app_config.env`). If `AWS_CONFIG_PATH` is also set (e.g., to `config/aws_config.env`), variables are loaded from that file as well. Environment variables set directly in the system will always take precedence over those defined in these `.env` files.
11
+
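As a minimal sketch of this layered loading (using `python-dotenv`; the app's real logic lives in `tools/config.py` and may differ), the precedence rules described above could be implemented like this:

```python
# Minimal sketch only - the app's actual loading logic is in tools/config.py.
import os
from dotenv import load_dotenv

APP_CONFIG_PATH = os.environ.get("APP_CONFIG_PATH", "config/app_config.env")
AWS_CONFIG_PATH = os.environ.get("AWS_CONFIG_PATH", "")

# override=False keeps any variable already set in the real environment,
# so system environment variables take precedence over values in the .env files.
if os.path.exists(APP_CONFIG_PATH):
    load_dotenv(APP_CONFIG_PATH, override=False)
if AWS_CONFIG_PATH and os.path.exists(AWS_CONFIG_PATH):
    load_dotenv(AWS_CONFIG_PATH, override=False)

RUN_AWS_FUNCTIONS = os.environ.get("RUN_AWS_FUNCTIONS", "0")
print(f"RUN_AWS_FUNCTIONS = {RUN_AWS_FUNCTIONS}")
```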
12
+ ## App Configuration File (config.env)
13
+
14
+ This section details variables related to the main application configuration file.
15
+
16
+ * **`APP_CONFIG_PATH`**
17
+ * **Description:** Specifies the path to the application configuration `.env` file. This file contains various settings that control the application's behavior.
18
+ * **Default Value:** `config/app_config.env`
19
+ * **Configuration:** Set as an environment variable directly. This variable defines where to load other application configurations, so it cannot be set within `config/app_config.env` itself.
20
+
21
+ ## AWS Options
22
+
23
+ This section covers configurations related to AWS services used by the application.
24
+
25
+ * **`AWS_CONFIG_PATH`**
26
+ * **Description:** Specifies the path to the AWS configuration `.env` file. This file is intended to store AWS credentials and specific settings.
27
+ * **Default Value:** `''` (empty string)
28
+ * **Configuration:** Set as an environment variable directly. This variable defines an additional source for AWS-specific configurations.
29
+
30
+ * **`RUN_AWS_FUNCTIONS`**
31
+ * **Description:** Enables or disables AWS-specific functionalities within the application. Set to `"1"` to enable and `"0"` to disable.
32
+ * **Default Value:** `"0"`
33
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
34
+
35
+ * **`AWS_REGION`**
36
+ * **Description:** Defines the AWS region where services like S3, Cognito, and Textract are located.
37
+ * **Default Value:** `''`
38
+ * **Configuration:** Set as an environment variable directly, or include in `config/aws_config.env` (if `AWS_CONFIG_PATH` is configured).
39
+
40
+ * **`AWS_CLIENT_ID`**
41
+ * **Description:** The client ID for AWS Cognito, used for user authentication.
42
+ * **Default Value:** `''`
43
+ * **Configuration:** Set as an environment variable directly, or include in `config/aws_config.env` (if `AWS_CONFIG_PATH` is configured).
44
+
45
+ * **`AWS_CLIENT_SECRET`**
46
+ * **Description:** The client secret for AWS Cognito, used in conjunction with the client ID for authentication.
47
+ * **Default Value:** `''`
48
+ * **Configuration:** Set as an environment variable directly, or include in `config/aws_config.env` (if `AWS_CONFIG_PATH` is configured).
49
+
50
+ * **`AWS_USER_POOL_ID`**
51
+ * **Description:** The user pool ID for AWS Cognito, identifying the user directory.
52
+ * **Default Value:** `''`
53
+ * **Configuration:** Set as an environment variable directly, or include in `config/aws_config.env` (if `AWS_CONFIG_PATH` is configured).
54
+
55
+ * **`AWS_ACCESS_KEY`**
56
+ * **Description:** The AWS access key ID for programmatic access to AWS services.
57
+ * **Default Value:** `''` (Note: Often found in the environment or AWS credentials file.)
58
+ * **Configuration:** Set as an environment variable directly, or include in `config/aws_config.env` (if `AWS_CONFIG_PATH` is configured). It's also commonly configured via shared AWS credentials files or IAM roles.
59
+
60
+ * **`AWS_SECRET_KEY`**
61
+ * **Description:** The AWS secret access key corresponding to the AWS access key ID.
62
+ * **Default Value:** `''` (Note: Often found in the environment or AWS credentials file.)
63
+ * **Configuration:** Set as an environment variable directly, or include in `config/aws_config.env` (if `AWS_CONFIG_PATH` is configured). It's also commonly configured via shared AWS credentials files or IAM roles.
64
+
65
+ * **`DOCUMENT_REDACTION_BUCKET`**
66
+ * **Description:** The name of the S3 bucket used for storing documents related to the redaction process.
67
+ * **Default Value:** `''`
68
+ * **Configuration:** Set as an environment variable directly, or include in `config/aws_config.env` (if `AWS_CONFIG_PATH` is configured).
69
+
70
+ * **`CUSTOM_HEADER`**
71
+ * **Description:** Specifies a custom header name to be included in requests, often used for services like AWS CloudFront.
72
+ * **Default Value:** `''`
73
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
74
+
75
+ * **`CUSTOM_HEADER_VALUE`**
76
+ * **Description:** The value for the custom header specified by `CUSTOM_HEADER`.
77
+ * **Default Value:** `''`
78
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
79
+
80
+ ## Image Options
81
+
82
+ Settings related to image processing within the application.
83
+
84
+ * **`IMAGES_DPI`**
85
+ * **Description:** Dots Per Inch (DPI) setting for image processing, affecting the resolution and quality of processed images.
86
+ * **Default Value:** `'300.0'`
87
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
88
+
89
+ * **`LOAD_TRUNCATED_IMAGES`**
90
+ * **Description:** Controls whether the application attempts to load truncated images. Set to `'True'` to enable.
91
+ * **Default Value:** `'True'`
92
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
93
+
94
+ * **`MAX_IMAGE_PIXELS`**
95
+ * **Description:** Sets the maximum number of pixels for an image that the application will process. Leave blank for no limit. This can help prevent issues with very large images.
96
+ * **Default Value:** `''`
97
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
98
+
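As an illustration, the image options above roughly correspond to standard Pillow settings; exactly where the app applies them is not shown here, so treat this as a sketch:

```python
# Illustrative only - shows the Pillow settings these options typically map to.
from PIL import Image, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True   # LOAD_TRUNCATED_IMAGES='True'
Image.MAX_IMAGE_PIXELS = None            # MAX_IMAGE_PIXELS='' (no pixel limit)

IMAGES_DPI = 300.0                       # IMAGES_DPI='300.0', used when rasterising PDF pages
print(f"Rendering pages at {IMAGES_DPI} DPI")
```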
99
+ ## File I/O Options
100
+
101
+ Configuration for input and output file handling.
102
+
103
+ * **`SESSION_OUTPUT_FOLDER`**
104
+ * **Description:** If set to `'True'`, the application will save output and input files into session-specific subfolders, helping to organise files from different user sessions.
105
+ * **Default Value:** `'False'`
106
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
107
+
108
+ * **`GRADIO_OUTPUT_FOLDER`** (aliased as `OUTPUT_FOLDER`)
109
+ * **Description:** Specifies the default output folder for files generated by Gradio components. Can be set to "TEMP" to use a temporary directory.
110
+ * **Default Value:** `'output/'`
111
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
112
+
113
+ * **`GRADIO_INPUT_FOLDER`** (aliased as `INPUT_FOLDER`)
114
+ * **Description:** Specifies the default input folder for files used by Gradio components. Can be set to "TEMP" to use a temporary directory.
115
+ * **Default Value:** `'input/'`
116
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
117
+
118
+ ## Logging Options
119
+
120
+ Settings for configuring application logging, including log formats and storage locations.
121
+
122
+ * **`SAVE_LOGS_TO_CSV`**
123
+ * **Description:** Enables or disables saving logs to CSV files. Set to `'True'` to enable.
124
+ * **Default Value:** `'True'`
125
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
126
+
127
+ * **`USE_LOG_SUBFOLDERS`**
128
+ * **Description:** If enabled (`'True'`), logs will be stored in subfolders based on date and hostname, aiding in log organisation.
129
+ * **Default Value:** `'True'`
130
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
131
+
132
+ * **`FEEDBACK_LOGS_FOLDER`**
133
+ * **Description:** Specifies the base folder for storing feedback logs. If `USE_LOG_SUBFOLDERS` is true, date/hostname subfolders will be created within this folder.
134
+ * **Default Value:** `'feedback/'`
135
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
136
+
137
+ * **`ACCESS_LOGS_FOLDER`**
138
+ * **Description:** Specifies the base folder for storing access logs. If `USE_LOG_SUBFOLDERS` is true, date/hostname subfolders will be created within this folder.
139
+ * **Default Value:** `'logs/'`
140
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
141
+
142
+ * **`USAGE_LOGS_FOLDER`**
143
+ * **Description:** Specifies the base folder for storing usage logs. If `USE_LOG_SUBFOLDERS` is true, date/hostname subfolders will be created within this folder.
144
+ * **Default Value:** `'usage/'`
145
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
146
+
147
+ * **`DISPLAY_FILE_NAMES_IN_LOGS`**
148
+ * **Description:** If set to `'True'`, file names will be included in the log entries.
149
+ * **Default Value:** `'False'`
150
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
151
+
152
+ * **`CSV_ACCESS_LOG_HEADERS`**
153
+ * **Description:** Defines custom headers for CSV access logs. If left blank, component labels will be used as headers.
154
+ * **Default Value:** `''`
155
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
156
+
157
+ * **`CSV_FEEDBACK_LOG_HEADERS`**
158
+ * **Description:** Defines custom headers for CSV feedback logs. If left blank, component labels will be used as headers.
159
+ * **Default Value:** `''`
160
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
161
+
162
+ * **`CSV_USAGE_LOG_HEADERS`**
163
+ * **Description:** Defines custom headers for CSV usage logs.
164
+ * **Default Value:** A predefined list of header names. Refer to `tools/config.py` for the complete list.
165
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
166
+
167
+ * **`SAVE_LOGS_TO_DYNAMODB`**
168
+ * **Description:** Enables or disables saving logs to AWS DynamoDB. Set to `'True'` to enable. Requires appropriate AWS setup.
169
+ * **Default Value:** `'False'`
170
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
171
+
172
+ * **`ACCESS_LOG_DYNAMODB_TABLE_NAME`**
173
+ * **Description:** The name of the DynamoDB table used for storing access logs.
174
+ * **Default Value:** `'redaction_access_log'`
175
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
176
+
177
+ * **`DYNAMODB_ACCESS_LOG_HEADERS`**
178
+ * **Description:** Specifies the headers (attributes) for the DynamoDB access log table.
179
+ * **Default Value:** `''`
180
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
181
+
182
+ * **`FEEDBACK_LOG_DYNAMODB_TABLE_NAME`**
183
+ * **Description:** The name of the DynamoDB table used for storing feedback logs.
184
+ * **Default Value:** `'redaction_feedback'`
185
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
186
+
187
+ * **`DYNAMODB_FEEDBACK_LOG_HEADERS`**
188
+ * **Description:** Specifies the headers (attributes) for the DynamoDB feedback log table.
189
+ * **Default Value:** `''`
190
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
191
+
192
+ * **`USAGE_LOG_DYNAMODB_TABLE_NAME`**
193
+ * **Description:** The name of the DynamoDB table used for storing usage logs.
194
+ * **Default Value:** `'redaction_usage'`
195
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
196
+
197
+ * **`DYNAMODB_USAGE_LOG_HEADERS`**
198
+ * **Description:** Specifies the headers (attributes) for the DynamoDB usage log table.
199
+ * **Default Value:** `''`
200
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
201
+
202
+ * **`LOGGING`**
203
+ * **Description:** Enables or disables general console logging. Set to `'True'` to enable.
204
+ * **Default Value:** `'False'`
205
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
206
+
207
+ * **`LOG_FILE_NAME`**
208
+ * **Description:** Specifies the name for the CSV log file if `SAVE_LOGS_TO_CSV` is enabled.
209
+ * **Default Value:** `'log.csv'`
210
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
211
+
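For reference, a hedged sketch of what writing a usage-log entry to the DynamoDB table named above could look like with `boto3`; the attribute names are illustrative assumptions, not the app's actual log schema:

```python
# Hedged sketch - illustrative attribute names, not the app's actual log schema.
from datetime import datetime, timezone

import boto3

dynamodb = boto3.resource("dynamodb")      # region comes from AWS_REGION / credentials
table = dynamodb.Table("redaction_usage")  # USAGE_LOG_DYNAMODB_TABLE_NAME default

table.put_item(
    Item={
        "id": "example-session-id",        # assumed partition key
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "task": "redact_pdf",
    }
)
```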
212
+ ## Redaction Options
213
+
214
+ Configurations related to the text redaction process, including PII detection models and external tool paths.
215
+
216
+ * **`TESSERACT_FOLDER`**
217
+ * **Description:** Path to the local Tesseract OCR installation folder, used for text extraction. Only required if Tesseract is not on the system path, or if you are running a version of the app packaged as an .exe with PyInstaller.
218
+ * **Default Value:** `""` (empty string)
219
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
220
+
221
+ * **`POPPLER_FOLDER`**
222
+ * **Description:** Path to the local Poppler installation's `bin` folder. Poppler is used for PDF processing. Only required if Poppler is not on the system path, or if you are running a version of the app packaged as an .exe with PyInstaller.
223
+ * **Default Value:** `""` (empty string)
224
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
225
+
226
+ * **`SELECTABLE_TEXT_EXTRACT_OPTION`**
227
+ * **Description:** Display name in the UI for the text extraction method that processes selectable text directly from PDFs.
228
+ * **Default Value:** `"Local model - selectable text"`
229
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
230
+
231
+ * **`TESSERACT_TEXT_EXTRACT_OPTION`**
232
+ * **Description:** Display name in the UI for the text extraction method using local Tesseract OCR (for PDFs without selectable text).
233
+ * **Default Value:** `"Local OCR model - PDFs without selectable text"`
234
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
235
+
236
+ * **`TEXTRACT_TEXT_EXTRACT_OPTION`**
237
+ * **Description:** Display name in the UI for the text extraction method using AWS Textract service.
238
+ * **Default Value:** `"AWS Textract service - all PDF types"`
239
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
240
+
241
+ * **`NO_REDACTION_PII_OPTION`**
242
+ * **Description:** Display name in the UI for the option to only extract text without performing any PII detection or redaction.
243
+ * **Default Value:** `"Only extract text (no redaction)"`
244
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
245
+
246
+ * **`LOCAL_PII_OPTION`**
247
+ * **Description:** Display name in the UI for the PII detection method using a local model.
248
+ * **Default Value:** `"Local"`
249
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
250
+
251
+ * **`AWS_PII_OPTION`**
252
+ * **Description:** Display name in the UI for the PII detection method using AWS Comprehend.
253
+ * **Default Value:** `"AWS Comprehend"`
254
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
255
+
256
+ * **`SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS`**
257
+ * **Description:** Controls whether local text extraction options (selectable text, Tesseract) are shown in the UI. Set to `'True'` to show.
258
+ * **Default Value:** `'True'`
259
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
260
+
261
+ * **`SHOW_AWS_TEXT_EXTRACTION_OPTIONS`**
262
+ * **Description:** Controls whether AWS Textract text extraction option is shown in the UI. Set to `'True'` to show.
263
+ * **Default Value:** `'True'`
264
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
265
+
266
+ * **`DEFAULT_TEXT_EXTRACTION_MODEL`**
267
+ * **Description:** Sets the default text extraction model selected in the UI. Defaults to `TEXTRACT_TEXT_EXTRACT_OPTION` if AWS options are shown; otherwise, defaults to `SELECTABLE_TEXT_EXTRACT_OPTION`.
268
+ * **Default Value:** Value of `TEXTRACT_TEXT_EXTRACT_OPTION` if `SHOW_AWS_TEXT_EXTRACTION_OPTIONS` is True, else value of `SELECTABLE_TEXT_EXTRACT_OPTION`.
269
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. Provide one of the text extraction option display names.
270
+
271
+ * **`SHOW_LOCAL_PII_DETECTION_OPTIONS`**
272
+ * **Description:** Controls whether the local PII detection option is shown in the UI. Set to `'True'` to show.
273
+ * **Default Value:** `'True'`
274
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
275
+
276
+ * **`SHOW_AWS_PII_DETECTION_OPTIONS`**
277
+ * **Description:** Controls whether the AWS Comprehend PII detection option is shown in the UI. Set to `'True'` to show.
278
+ * **Default Value:** `'True'`
279
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
280
+
281
+ * **`DEFAULT_PII_DETECTION_MODEL`**
282
+ * **Description:** Sets the default PII detection model selected in the UI. Defaults to `AWS_PII_OPTION` if AWS options are shown; otherwise, defaults to `LOCAL_PII_OPTION`.
283
+ * **Default Value:** Value of `AWS_PII_OPTION` if `SHOW_AWS_PII_DETECTION_OPTIONS` is True, else value of `LOCAL_PII_OPTION`.
284
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. Provide one of the PII detection option display names.
285
+
286
+ * **`CHOSEN_COMPREHEND_ENTITIES`**
287
+ * **Description:** A list of AWS Comprehend PII entity types to be redacted when using AWS Comprehend.
288
+ * **Default Value:** A predefined list of entity types. Refer to `tools/config.py` for the complete list.
289
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. This should be a string representation of a Python list.
290
+
291
+ * **`FULL_COMPREHEND_ENTITY_LIST`**
292
+ * **Description:** The complete list of PII entity types supported by AWS Comprehend that can be selected for redaction.
293
+ * **Default Value:** A predefined list of entity types. Refer to `tools/config.py` for the complete list.
294
+ * **Configuration:** This is typically an informational variable reflecting the capabilities of AWS Comprehend and is not meant to be changed by users directly affecting redaction behavior (use `CHOSEN_COMPREHEND_ENTITIES` for that). Set as an environment variable directly, or include in `config/app_config.env`.
295
+
296
+ * **`CHOSEN_REDACT_ENTITIES`**
297
+ * **Description:** A list of local PII entity types to be redacted when using the local PII detection model.
298
+ * **Default Value:** A predefined list of entity types. Refer to `tools/config.py` for the complete list.
299
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. This should be a string representation of a Python list.
300
+
301
+ * **`FULL_ENTITY_LIST`**
302
+ * **Description:** The complete list of PII entity types supported by the local PII detection model that can be selected for redaction.
303
+ * **Default Value:** A predefined list of entity types. Refer to `tools/config.py` for the complete list.
304
+ * **Configuration:** This is typically an informational variable reflecting the capabilities of the local model and is not meant to be changed by users directly affecting redaction behavior (use `CHOSEN_REDACT_ENTITIES` for that). Set as an environment variable directly, or include in `config/app_config.env`.
305
+
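To make the "string representation of a Python list" format concrete, here is a small hedged example of setting and parsing such a value; the entity names are illustrative, and the app's own parsing lives in `tools/config.py`:

```python
# Illustrative only - entity names are examples, and the app's parsing may differ.
import ast
import os

os.environ["CHOSEN_REDACT_ENTITIES"] = "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS']"

chosen_redact_entities = ast.literal_eval(os.environ["CHOSEN_REDACT_ENTITIES"])
print(chosen_redact_entities)  # ['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS']
```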
306
+ * **`PAGE_BREAK_VALUE`**
307
+ * **Description:** Defines a page count after which a function might restart. (Note: Currently not activated).
308
+ * **Default Value:** `'99999'`
309
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
310
+
311
+ * **`MAX_TIME_VALUE`**
312
+ * **Description:** Specifies the maximum time (in arbitrary units, likely seconds or milliseconds depending on implementation) for a process before it might be timed out.
313
+ * **Default Value:** `'999999'`
314
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
315
+
316
+ * **`CUSTOM_BOX_COLOUR`**
317
+ * **Description:** Allows specifying a custom color for the redaction boxes drawn on documents (e.g., "grey", "red", "#FF0000"). If empty, a default color is used.
318
+ * **Default Value:** `""` (empty string)
319
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
320
+
321
+ * **`REDACTION_LANGUAGE`**
322
+ * **Description:** Specifies the language for redaction processing. Currently, only "en" (English) is supported.
323
+ * **Default Value:** `"en"`
324
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
325
+
326
+ * **`RETURN_PDF_END_OF_REDACTION`**
327
+ * **Description:** If set to `'True'`, the application will return a PDF document at the end of the redaction task.
328
+ * **Default Value:** `"True"`
329
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
330
+
331
+ * **`COMPRESS_REDACTED_PDF`**
332
+ * **Description:** If set to `'True'`, the redacted PDF output will be compressed. This can reduce file size but may cause issues on systems with low memory.
333
+ * **Default Value:** `"False"`
334
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
335
+
336
+ ## App Run Options
337
+
338
+ General runtime configurations for the application.
339
+
340
+ * **`TLDEXTRACT_CACHE`**
341
+ * **Description:** Path to the cache file used by the `tldextract` library, which helps in accurately extracting top-level domains (TLDs) from URLs.
342
+ * **Default Value:** `'tld/.tld_set_snapshot'`
343
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
344
+
345
+ * **`COGNITO_AUTH`**
346
+ * **Description:** Enables or disables AWS Cognito authentication for the application. Set to `'1'` to enable.
347
+ * **Default Value:** `'0'`
348
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
349
+
350
+ * **`RUN_DIRECT_MODE`**
351
+ * **Description:** If set to `'1'`, runs the application in a "direct mode", which might alter certain behaviors (e.g., UI elements, processing flow).
352
+ * **Default Value:** `'0'`
353
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
354
+
355
+ * **`MAX_QUEUE_SIZE`**
356
+ * **Description:** The maximum number of requests that can be queued in the Gradio interface.
357
+ * **Default Value:** `'5'` (integer)
358
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
359
+
360
+ * **`MAX_FILE_SIZE`**
361
+ * **Description:** Maximum file size allowed for uploads (e.g., "250mb", "1gb").
362
+ * **Default Value:** `'250mb'`
363
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
364
+
365
+ * **`GRADIO_SERVER_PORT`**
366
+ * **Description:** The network port on which the Gradio server will listen.
367
+ * **Default Value:** `'7860'` (integer)
368
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
369
+
370
+ * **`ROOT_PATH`**
371
+ * **Description:** The root path for the application, useful if running behind a reverse proxy (e.g., `/app`).
372
+ * **Default Value:** `''` (empty string)
373
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
374
+
375
+ * **`DEFAULT_CONCURRENCY_LIMIT`**
376
+ * **Description:** The default concurrency limit for Gradio event handlers, controlling how many requests can be processed simultaneously.
377
+ * **Default Value:** `'3'`
378
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
379
+
380
+ * **`GET_DEFAULT_ALLOW_LIST`**
381
+ * **Description:** If set, enables the use of a default allow list for user access or specific functionalities. The exact behavior depends on application logic.
382
+ * **Default Value:** `''` (empty string)
383
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
384
+
385
+ * **`ALLOW_LIST_PATH`**
386
+ * **Description:** Path to a local CSV file containing an allow list (e.g., `config/default_allow_list.csv`).
387
+ * **Default Value:** `''` (empty string)
388
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
389
+
390
+ * **`S3_ALLOW_LIST_PATH`**
391
+ * **Description:** Path to an allow list CSV file stored in an S3 bucket (e.g., `default_allow_list.csv`). Requires `DOCUMENT_REDACTION_BUCKET` to be set.
392
+ * **Default Value:** `''` (empty string)
393
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
394
+
395
+ * **`FILE_INPUT_HEIGHT`**
396
+ * **Description:** Sets the height (in pixels or other CSS unit) of the file input component in the Gradio UI.
397
+ * **Default Value:** `'200'`
398
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
399
+
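As a hedged sketch of how these run options typically feed into a Gradio app (the real launch code is in `app.py` and may differ), the queue and launch settings could be wired up like this:

```python
# Hedged sketch - a placeholder Blocks app, not the real redaction UI.
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("Document redaction app placeholder")

demo.queue(max_size=5, default_concurrency_limit=3)  # MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT
demo.launch(
    server_port=7860,        # GRADIO_SERVER_PORT
    root_path="",            # ROOT_PATH (e.g. "/app" behind a reverse proxy)
    max_file_size="250mb",   # MAX_FILE_SIZE
)
```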
400
+ ## Cost Code Options
401
+
402
+ Settings related to tracking and applying cost codes for application usage.
403
+
404
+ * **`SHOW_COSTS`**
405
+ * **Description:** If set to `'True'`, cost-related information will be displayed in the UI.
406
+ * **Default Value:** `'False'`
407
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
408
+
409
+ * **`GET_COST_CODES`**
410
+ * **Description:** Enables fetching and using cost codes within the application. Set to `'True'` to enable.
411
+ * **Default Value:** `'False'`
412
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
413
+
414
+ * **`DEFAULT_COST_CODE`**
415
+ * **Description:** Specifies a default cost code to be used if cost codes are enabled but none is selected by the user.
416
+ * **Default Value:** `''` (empty string)
417
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
418
+
419
+ * **`COST_CODES_PATH`**
420
+ * **Description:** Path to a local CSV file containing available cost codes (e.g., `config/COST_CENTRES.csv`).
421
+ * **Default Value:** `''` (empty string)
422
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
423
+
424
+ * **`S3_COST_CODES_PATH`**
425
+ * **Description:** Path to a cost codes CSV file stored in an S3 bucket (e.g., `COST_CENTRES.csv`). Requires `DOCUMENT_REDACTION_BUCKET` to be set.
426
+ * **Default Value:** `''` (empty string)
427
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
428
+
429
+ * **`ENFORCE_COST_CODES`**
430
+ * **Description:** If set to `'True'` and `GET_COST_CODES` is also enabled, makes the selection of a cost code mandatory for users.
431
+ * **Default Value:** `'False'`
432
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
433
+
434
+ ## Whole Document API Options
435
+
436
+ Configurations for features related to processing whole documents via APIs, particularly AWS Textract for large documents.
437
+
438
+ * **`SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS`**
439
+ * **Description:** Controls whether UI options for whole document Textract calls are displayed. (Note: Mentioned as not currently implemented in the source).
440
+ * **Default Value:** `'False'`
441
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
442
+
443
+ * **`TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET`**
444
+ * **Description:** The S3 bucket used for input and output of whole document analysis with AWS Textract.
445
+ * **Default Value:** `''` (empty string)
446
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
447
+
448
+ * **`TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER`**
449
+ * **Description:** The subfolder within `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET` where input documents for Textract analysis are placed.
450
+ * **Default Value:** `'input'`
451
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
452
+
453
+ * **`TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER`**
454
+ * **Description:** The subfolder within `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET` where output results from Textract analysis are stored.
455
+ * **Default Value:** `'output'`
456
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
457
+
458
+ * **`LOAD_PREVIOUS_TEXTRACT_JOBS_S3`**
459
+ * **Description:** If set to `'True'`, the application will attempt to load data from previous Textract jobs stored in S3.
460
+ * **Default Value:** `'False'`
461
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
462
+
463
+ * **`TEXTRACT_JOBS_S3_LOC`**
464
+ * **Description:** The S3 subfolder (within `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET`) where Textract job data (output) is stored.
465
+ * **Default Value:** `'output'`
466
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
467
+
468
+ * **`TEXTRACT_JOBS_S3_INPUT_LOC`**
469
+ * **Description:** The S3 subfolder (within `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET`) where Textract job input is stored.
470
+ * **Default Value:** `'input'`
471
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
472
+
473
+ * **`TEXTRACT_JOBS_LOCAL_LOC`**
474
+ * **Description:** The local subfolder where Textract job data is stored if not using S3 or as a cache.
475
+ * **Default Value:** `'output'`
476
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
477
+
478
+ * **`DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS`**
479
+ * **Description:** Specifies the number of past days for which to display whole document Textract jobs in the UI.
480
+ * **Default Value:** `'7'`
481
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
src/faq.qmd ADDED
@@ -0,0 +1,222 @@
1
+ ---
2
+ title: "User FAQ"
3
+ format:
4
+ html:
5
+ toc: true # Enable the table of contents
6
+ toc-depth: 3 # Include headings up to level 2 (##)
7
+ toc-title: "On this page" # Optional: Title for your TOC
8
+ ---
9
+
10
+ ## General Advice:
11
+ * **Read the User Guide**: Many common questions are addressed in the detailed User Guide sections.
12
+ * **Start Simple**: If you're new, try redacting with default options first before customising extensively.
13
+ * **Human Review is Key**: Always manually review the `...redacted.pdf` or use the '**Review redactions**' tab. No automated system is perfect.
14
+ * **Save Incrementally**: When working on the '**Review redactions**' tab, use the '**Save changes on current page to file**' button periodically, especially for large documents.
15
+
16
+ ## General questions
17
+
18
+ #### What is document redaction and what does this app do?
19
+ Document redaction is the process of removing sensitive or personally identifiable information (PII) from documents. This application is a tool that automates this process for various document types, including PDFs, images, open text, and tabular data (`XLSX`/`CSV`/`Parquet`). It identifies potential PII using different methods and allows users to review, modify, and export the suggested redactions.
20
+
21
+ #### What types of documents and data can be redacted?
22
+ The app can handle a variety of formats. For documents, it supports `PDF`s and images (`JPG`, `PNG`). For tabular data, it works with `XLSX`, `CSV`, and `Parquet` files. Additionally, it can redact open text that is copied and pasted directly into the application interface.
23
+
24
+ #### How does the app identify text and PII for redaction?
25
+ The app employs several methods for text extraction and PII identification. Text can be extracted directly from selectable `PDF` text, using a local Optical Character Recognition (OCR) model for image-based content, or through the **AWS Textract service** for more complex documents, handwriting, and signatures (if available). For PII identification, it can use a local model based on the `spacy` package or the **AWS Comprehend service** for more accurate results (if available).
26
+
27
+ #### Can I customise what information is redacted?
28
+ Yes, the app offers extensive customisation options. You can define terms that should never be redacted (an '**allow list**'), terms that should always be redacted (a '**deny list**'), and specify entire pages to be fully redacted using `CSV` files. You can also select specific types of entities to redact, such as dates, or remove default entity types that are not relevant to your needs.
29
+
30
+ #### How can I review and modify the suggested redactions?
31
+ The app provides a dedicated '**Review redactions**' tab with a visual interface. You can upload the original document and the generated review file (`CSV`) to see the suggested redactions overlaid on the document. Here, you can move, resize, delete, and add new redaction boxes. You can also filter suggested redactions based on criteria and exclude them individually or in groups.
32
+
33
+ #### Can I work with tabular data or copy and pasted text?
34
+ Yes, the app has a dedicated tab for redacting tabular data files (`XLSX`/`CSV`) and open text. For tabular data, you can upload your file and select which columns to redact. For open text, you can simply paste the text into a box. You can then choose the redaction method and the desired output format for the anonymised data.
35
+
36
+ #### What are the options for the anonymisation format of redacted text?
37
+ When redacting tabular data or open text, you have several options for how the redacted information is replaced. The default is to replace the text with '**REDACTED**'. Other options include replacing it with the entity type (e.g., 'PERSON'), redacting completely (removing the text), replacing it with a consistent hash value, or masking it with stars ('*').
38
+
39
+ #### Can I export or import redactions to/from other software like Adobe Acrobat?
40
+ Yes, the app supports exporting and importing redaction data using the **Adobe Acrobat** comment file format (`.xfdf`). You can export suggested redactions from the app to an `.xfdf` file that can be opened in **Adobe**. Conversely, you can import an `.xfdf` file created in **Adobe** into the app to generate a review file (`CSV`) for further work within the application.
41
+
42
+ ## Troubleshooting
43
+
44
+ #### Q1: The app missed some personal information or redacted things it shouldn't have. Is it broken?
45
+ A: Not necessarily. The app is not 100% accurate and is designed as an aid. The `README` explicitly states: "**NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed by a human before using the final outputs.**"
46
+ * **Solution**: Always use the '**Review redactions**' tab to manually inspect, add, remove, or modify redactions.
47
+
48
+ #### Q2: I uploaded a `PDF`, but no text was found, or redactions are very poor using the '**Local model - selectable text**' option.
49
+ A: This option only works if your `PDF` has actual selectable text. If your `PDF` is an image scan (even if it looks like text), this method won't work well.
50
+ * **Solution**:
51
+ * Try the '**Local OCR model - PDFs without selectable text**' option. This uses Tesseract OCR to "read" the text from images.
52
+ * For best results, especially with complex documents, handwriting, or signatures, use the '**AWS Textract service - all PDF types**' if available.
53
+
54
+ #### Q3: Handwriting or signatures are not being redacted properly.
55
+ A: The '**Local**' text/OCR methods (selectable text or Tesseract) struggle with handwriting and signatures.
56
+ * **Solution**:
57
+ * Use the '**AWS Textract service**' for text extraction.
58
+ * Ensure that on the main '**Redact PDFs/images**' tab, under "**Optional - select signature extraction**" (when **AWS Textract** is chosen), you have enabled handwriting and/or signature detection. Note that signature detection has higher cost implications.
59
+
60
+ #### Q4: The options for '**AWS Textract service**' or '**AWS Comprehend**' are missing or greyed out.
61
+ A: These services are typically only available when the app is running in an **AWS** environment or has been specifically configured by your system admin to access these services (e.g., via `API` keys).
62
+ * **Solution**:
63
+ * Check if your instance of the app is supposed to have **AWS** services enabled.
64
+ * If running outside **AWS**, see the "**Using AWS Textract and Comprehend when not running in an AWS environment**" section in the advanced guide. This involves configuring **AWS** access keys, which should be done with IT and data security approval.
65
+
66
+ #### Q5: I re-processed the same document, and it seems to be taking a long time and potentially costing more with **AWS** services. Can I avoid this?
67
+ A: Yes. If you have previously processed a document with **AWS Textract** or the **Local OCR** model, the app generates a `.json` output file (`..._textract.json` or `..._ocr_results_with_words.json`).
68
+ * **Solution**: When re-uploading your original document for redaction, also upload the corresponding `.json` file. The app should detect this (the "**Existing Textract output file found**" box may be checked), skipping the expensive text extraction step.
69
+
70
+ #### Q6: My app crashed, or I reloaded the page. Are my output files lost?
71
+ A: If you are logged in via **AWS Cognito** and the server hasn't been shut down, you might be able to recover them.
72
+ * **Solution**: Go to the '**Redaction settings**' tab, scroll to the bottom, and look for '**View all output files from this session**'.
73
+
74
+ #### Q7: My custom allow list (terms to never redact) or deny list (terms to always redact) isn't working.
75
+ A: There are a few common reasons:
76
+ * **File Format**: Ensure your list is a `.csv` file with terms in the first column only, with no column header.
77
+ * **Case Sensitivity**: Terms in the allow/deny list are case sensitive.
78
+ * **Deny List & 'CUSTOM' Entity**: For a deny list to work, you must select the '**CUSTOM**' entity type in '**Redaction settings**' under '**Entities to redact**'.
79
+ * **Manual Additions**: If you manually added terms in the app interface (under '**Manually modify custom allow...**'), ensure you pressed `Enter` after typing each term in its cell.
80
+ * **Fuzzy Search for Deny List**: If you intend to use fuzzy matching for your deny list, ensure '**CUSTOM_FUZZY**' is selected as an entity type, and you've configured the "**maximum number of spelling mistakes allowed.**"
81
+
82
+ #### Q8: I'm trying to review redactions, but the `PDF` in the viewer looks like it's already redacted with black boxes.
83
+ A: You likely uploaded the `...redacted.pdf` file instead of the original document.
84
+ * **Solution**: On the '**Review redactions**' tab, ensure you upload the original, unredacted `PDF` alongside the `..._review_file.csv`.
85
+
86
+ #### Q9: I can't move or pan the document in the '**Review redactions**' viewer when zoomed in.
87
+ A: You are likely in "**add redaction boxes**" mode.
88
+ * **Solution**: Scroll to the bottom of the document viewer pane and click the hand icon. This switches to "**modify mode**," allowing you to pan the document by clicking and dragging, and also to move/resize existing redaction boxes.
89
+
90
+ #### Q10: I accidentally clicked "**Exclude all items in table from redactions**" on the '**Review redactions**' tab without filtering, and now all my redactions are gone!
91
+ A: This can happen if you don't apply a filter first.
92
+ * **Solution**: Click the '**Undo last element removal**' button immediately. This should restore the redactions. Always ensure you have clicked the blue tick icon next to the search box to apply your filter before using "**Exclude all items...**".
93
+
94
+ #### Q11: Redaction of my `CSV` or `XLSX` file isn't working correctly.
95
+ A: The app expects a specific format for tabular data.
96
+ * **Solution**: Ensure your data file has a simple table format, with the table starting in the first cell (`A1`). There should be no other information or multiple tables within the sheet you intend to redact. For `XLSX` files, each sheet to be redacted must follow this format.
97
+
98
+ #### Q12: The "**Identify duplicate pages**" feature isn't finding duplicates I expect, or it's flagging too many pages.
99
+ A: This feature uses text similarity based on the `ocr_outputs.csv` files and has a default similarity threshold (e.g., 90%).
100
+ * **Solution**:
101
+ * Ensure you've uploaded the correct `ocr_outputs.csv` files for all documents you're comparing.
102
+ * Review the `page_similarity_results.csv` output to see the similarity scores. The 90% threshold might be too high or too low for your specific documents. The current version of the app described doesn't seem to allow changing this threshold in the `UI`, so you'd mainly use the output to inform your manual review.
103
+
104
+ #### Q13: I exported a review file to Adobe (`.xfdf`), but when I open it in Adobe Acrobat, it can't find the `PDF` or shows no redactions.
105
+ A: When **Adobe Acrobat** prompts you, it needs to be pointed to the exact original `PDF`.
106
+ * **Solution**: Ensure you select the original, unredacted `PDF` file that was used to generate the `..._review_file.csv` (and subsequently the `.xfdf` file) when **Adobe Acrobat** asks for the associated document.
107
+
108
+ #### Q14: My **AWS Textract API** job (submitted via "**Submit whole document to AWS Textract API...**") is taking a long time, or I don't know if it's finished.
109
+ A: Large documents can take time. The document estimates about five seconds per page as a rough guide.
110
+ * **Solution**:
111
+ * After submitting, a **Job ID** will appear.
112
+ * Periodically click the '**Check status of Textract job and download**' button. Processing continues in the background.
113
+ * Once ready, the `_textract.json` output will appear in the output area.
114
+
115
+ #### Q15: I'm trying to redact specific terms from my deny list, but they are not being picked up, even though the '**CUSTOM**' entity is selected.
116
+ A: The deny list matches whole words with exact spelling by default.
117
+ * **Solution**:
118
+ * Double-check the spelling and case in your deny list.
119
+ * If you expect misspellings to be caught, you need to use the '**CUSTOM_FUZZY**' entity type and configure the "**maximum number of spelling mistakes allowed**" under '**Redaction settings**'. Then, upload your deny list.
120
+
121
+ #### Q16: I set the "**Lowest page to redact**" and "**Highest page to redact**" in '**Redaction settings**', but the app still seems to process or show redactions outside this range.
122
+ A: The page range setting primarily controls which pages have redactions applied in the final `...redacted.pdf`. The underlying text extraction (especially with OCR/Textract) might still process the whole document to generate the `...ocr_results.csv` or `..._textract.json`. When reviewing, the `review_file.csv` might initially contain all potential redactions found across the document.
123
+ * **Solution**:
124
+ * Ensure the `...redacted.pdf` correctly reflects the page range.
125
+ * When reviewing, use the page navigation and filters on the '**Review redactions**' tab to focus on your desired page range. The final application of redactions from the review tab should also respect the range if it's still set, but primarily it works off the `review_file.csv`.
126
+
127
+ #### Q17: My "**Full page redaction list**" isn't working. I uploaded a `CSV` with page numbers, but those pages aren't blacked out.
128
+ A: Common issues include:
129
+ * **File Format**: Ensure your list is a `.csv` file with page numbers in the first column only, with no column header. Each page number should be on a new row.
130
+ * **Redaction Task**: Simply uploading the list doesn't automatically redact. You need to:
131
+ 1. Upload the `PDF` you want to redact.
132
+ 2. Upload the full page redaction `CSV` in '**Redaction settings**'.
133
+ 3. It's often best to deselect all other entity types in '**Redaction settings**' if you only want to redact these full pages.
134
+ 4. Run the '**Redact document**' process. The output `...redacted.pdf` should show the full pages redacted, and the `...review_file.csv` will list these pages.
135
+
136
+ #### Q18: I merged multiple `...review_file.csv` files, but the output seems to have duplicate redaction boxes or some are missing.
137
+ A: The merge feature simply combines all rows from the input review files.
138
+ * **Solution**:
139
+ * **Duplicates**: If the same redaction (same location, text, label) was present in multiple input files, it will appear multiple times in the merged file. You'll need to manually remove these duplicates on the '**Review redactions**' tab or by editing the merged `...review_file.csv` in a spreadsheet editor before review.
140
+ * **Missing**: Double-check that all intended `...review_file.csv` files were correctly uploaded for the merge. Ensure the files themselves contained the expected redactions.
141
+
142
+ #### Q19: I imported an `.xfdf` Adobe comment file, but the `review_file.csv` generated doesn't accurately reflect the highlights or comments I made in Adobe Acrobat.
143
+ A: The app converts Adobe's comment/highlight information into its review_file format. Discrepancies can occur if:
144
+ * **Comment Types**: The app primarily looks for highlight-style annotations that it can interpret as redaction areas. Other Adobe comment types (e.g., sticky notes without highlights, text strike-throughs not intended as redactions) might not translate.
145
+ * **Complexity**: Very complex or unusually shaped Adobe annotations might not convert perfectly.
146
+ * **PDF Version**: Ensure the `PDF` uploaded alongside the `.xfdf` is the exact same original, unredacted `PDF` that the comments were made on in Adobe.
147
+ * **Solution**: After import, always open the generated `review_file.csv` (with the original `PDF`) on the '**Review redactions**' tab to verify and adjust as needed.
148
+
149
+ #### Q20: The **Textract API** job status table (under "**Submit whole document to AWS Textract API...**") only shows recent jobs, or I can't find an older **Job ID** I submitted.
150
+ A: The table showing **Textract** job statuses might have a limit or only show jobs from the current session or within a certain timeframe (e.g., "up to seven days old" is mentioned).
151
+ * **Solution**:
152
+ * It's good practice to note down the **Job ID** immediately after submission if you plan to check it much later.
153
+ * If the `_textract.json` file was successfully created from a previous job, you can re-upload that `.json` file with your original `PDF` to bypass the `API` call and proceed directly to redaction or OCR conversion.
154
+
155
+ #### Q21: I edited a `...review_file.csv` in Excel (e.g., changed coordinates, labels, colors), but when I upload it to the '**Review redactions**' tab, the boxes are misplaced, the wrong color, or it causes errors.
156
+ A: The `review_file.csv` has specific columns and data formats (e.g., coordinates, `RGB` color tuples like `(0,0,255)`).
157
+ * **Solution**:
158
+ * **Coordinates (xmin, ymin, xmax, ymax)**: Ensure these are numeric and make sense for `PDF` coordinates. Drastic incorrect changes can misplace boxes.
159
+ * **Colors**: Ensure the color column uses the `(R,G,B)` format, e.g., `(0,0,255)` for blue, not hex codes or color names, unless the app specifically handles that (the guide mentions `RGB`).
160
+ * **CSV Integrity**: Ensure you save the file strictly as a `CSV`. Excel sometimes adds extra formatting or changes delimiters if not saved carefully.
161
+ * **Column Order**: Do not change the order of columns in the `review_file.csv`.
162
+ * **Test Small Changes**: Modify one or two rows/values first to see the effect before making bulk changes.
163
+
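+ If you've made bulk edits in a spreadsheet, a quick programmatic sanity check can catch formatting problems before you re-upload. Below is a minimal, hypothetical sketch in Python: the coordinate column names follow the guidance above, but the colour column name (`color`) and the file name are assumptions, so check them against a `review_file.csv` actually produced by the app.
+
+ ```python
+ # Hypothetical sanity check for a hand-edited review file before re-upload.
+ # The "color" column name and file name are assumptions - verify against a real review_file.csv.
+ import ast
+ import pandas as pd
+
+ df = pd.read_csv("example_review_file.csv")
+
+ # Coordinates should all be numeric
+ for col in ["xmin", "ymin", "xmax", "ymax"]:
+     assert pd.to_numeric(df[col], errors="coerce").notna().all(), f"Non-numeric value in {col}"
+
+ # Colours should parse as (R, G, B) tuples of three values
+ for value in df["color"]:
+     rgb = ast.literal_eval(str(value))
+     assert isinstance(rgb, tuple) and len(rgb) == 3, f"Unexpected colour format: {value}"
+
+ print("Review file looks structurally OK")
+ ```
+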
164
+ #### Q22: The cost and time estimation feature isn't showing up, or it's giving unexpected results.
165
+ A: This feature depends on admin configuration and certain conditions.
166
+ * **Solution**:
167
+ * **Admin Enabled**: Confirm with your system admin that the cost/time estimation feature is enabled in the app's configuration.
168
+ * **AWS Services**: Estimation is typically most relevant when using **AWS Textract** or **Comprehend**. If you're only using '**Local**' models, the estimation might be simpler or not show **AWS**-related costs.
169
+ * **Existing Output**: If "**Existing Textract output file found**" is checked (because you uploaded a pre-existing `_textract.json`), the estimated cost and time should be significantly lower for the **Textract** part of the process.
170
+
171
+ #### Q23: I'm prompted for a "**cost code**," but I don't know what to enter, or my search isn't finding it.
172
+ A: Cost code selection is an optional feature enabled by system admins for tracking **AWS** usage.
173
+ * **Solution**:
174
+ * **Contact Admin/Team**: If you're unsure which cost code to use, consult your team lead or the system administrator who manages the redaction app. They should provide the correct code or guidance.
175
+ * **Search Tips**: Try searching by project name, department, or any known identifiers for your cost center. The search might be case-sensitive or require exact phrasing.
176
+
177
+ #### Q24: I selected "**hash**" as the anonymisation output format for my tabular data, but the output still shows "**REDACTED**" or something else.
178
+ A: Ensure the selection was correctly registered before redacting.
179
+ * **Solution**:
180
+ * Double-check on the '**Open text or Excel/csv files**' tab, under '**Anonymisation output format**,' that "**hash**" (or your desired format) is indeed selected.
181
+ * Try re-selecting it and then click '**Redact text/data files**' again.
182
+ * If the issue persists, it might be a bug or a specific interaction with your data type that prevents hashing. Report this to your app administrator. "**Hash**" should replace PII with a consistent unique `ID` for each unique piece of PII.
183
+
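+ As an illustration of what consistent hashing means in practice, the sketch below shows how the same piece of PII always maps to the same ID. This is a conceptual example only; the app's actual hashing scheme may differ.
+
+ ```python
+ # Conceptual example only: a consistent ID per unique piece of PII.
+ import hashlib
+
+ def hash_pii(value: str) -> str:
+     return "ID_" + hashlib.sha256(value.encode("utf-8")).hexdigest()[:8]
+
+ print(hash_pii("Jane Smith"))                            # always the same output for the same input
+ print(hash_pii("Jane Smith") == hash_pii("Jane Smith"))  # True: same PII, same ID
+ ```
+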
184
+ #### Q25: I'm using '**CUSTOM_FUZZY**' for my deny list. I have "**Should fuzzy search match on entire phrases in deny list**" checked, but it's still matching individual words within my phrases or matching things I don't expect.
185
+ A: Fuzzy matching on entire phrases can be complex. The "**maximum number of spelling mistakes allowed**" applies to the entire phrase.
186
+ * **Solution**:
187
+ * **Mistake Count**: If your phrase is long and the allowed mistakes are few, it might not find matches if the errors are distributed. Conversely, too many allowed mistakes on a short phrase can lead to over-matching. Experiment with the mistake count.
188
+ * **Specificity**: If "**match on entire phrases**" is unchecked, it will fuzzy match each individual word (excluding stop words) in your deny list phrases. This can be very broad. Ensure this option is set according to your needs.
189
+ * **Test with Simple Phrases**: Try a very simple phrase with a known, small number of errors to see if the core fuzzy logic is working as you expect, then build up complexity.
190
+
191
+ #### Q26: I "**locked in**" a new redaction box format on the '**Review redactions**' tab (label, colour), but now I want to change it or go back to the pop-up for each new box.
192
+ A: When a format is locked, a new icon (described as looking like a "**gift tag**") appears at the bottom of the document viewer.
193
+ * **Solution**:
194
+ * Click the "**gift tag**" icon at the bottom of the document viewer pane.
195
+ * This will allow you to change the default locked format.
196
+ * To go back to the pop-up appearing for each new box, click the lock icon within that "**gift tag**" menu again to "**unlock**" it (it should turn from blue to its original state).
197
+
198
+ #### Q27: I clicked "**Redact document**," processing seemed to complete (e.g., progress bar finished, "complete" message shown), but no output files (`...redacted.pdf`, `...review_file.csv`) appeared in the output area.
199
+ A: This could be due to various reasons:
200
+ * **No PII Found**: If absolutely no PII was detected according to your settings (entities, allow/deny lists), the app might not generate a `...redacted.pdf` if there's nothing to redact, though a `review_file.csv` (potentially empty) and `ocr_results.csv` should still ideally appear.
201
+ * **Error During File Generation**: An unhandled error might have occurred silently during the final file creation step.
202
+ * **Browser/UI Issue**: The `UI` might not have refreshed to show the files.
203
+ * **Permissions**: In rare cases, if running locally, there might be file system permission issues preventing the app from writing outputs.
204
+ * **Solution**:
205
+ * Try refreshing the browser page (if feasible without losing input data, or after re-uploading).
206
+ * Check the '**Redaction settings**' tab for '**View all output files from this session**' (if logged in via Cognito) – they might be listed there.
207
+ * Try a very simple document with obvious PII and default settings to see if any output is generated.
208
+ * Check browser developer console (`F12`) for any error messages.
209
+
210
+ #### Q28: When reviewing, I click on a row in the '**Search suggested redactions**' table. The page changes, but the specific redaction box isn't highlighted, or the view doesn't scroll to it.
211
+ A: The highlighting feature (which should change the colour of the selected redaction box to blue) is an aid rather than a guarantee.
212
+ * **Solution**:
213
+ * Ensure you are on the correct page. The table click should take you there.
214
+ * The highlighting might be subtle or conflict with other `UI` elements. Manually scan the page for the text/label mentioned in the table row.
215
+ * Scrolling to the exact box isn't explicitly guaranteed, especially on very dense pages. The main function is page navigation.
216
+
217
+ #### Q29: I rotated a page in the '**Review redactions**' document viewer, and now all subsequent pages are also rotated, or if I navigate away and back, the rotation is lost.
218
+ A: The `README` states: "**When you switch page, the viewer will stay in your selected orientation, so if it looks strange, just rotate the page again and hopefully it will look correct!**"
219
+ * **Solution**:
220
+ * The rotation is a viewing aid for the current page session in the viewer. It does not permanently alter the original `PDF`.
221
+ * If subsequent pages appear incorrectly rotated, use the rotation buttons again for that new page.
222
+ * The rotation state might reset if you reload files or perform certain actions. Simply re-apply rotation as needed for viewing.
src/installation_guide.qmd ADDED
@@ -0,0 +1,233 @@
1
+ ---
2
+ title: "App installation guide (with CDK)"
3
+ format:
4
+ html:
5
+ toc: true # Enable the table of contents
6
+ toc-depth: 3 # Include headings up to level 2 (##)
7
+ toc-title: "On this page" # Optional: Title for your TOC
8
+ ---
9
+
10
+ # Introduction
11
+
12
+ This guide gives an overview of how to install the app in an AWS environment using the code in the `cdk/` folder of this GitHub repo. The most important thing you need is some familiarity with AWS and how to use it via the console or command line, as well as administrator access to at least one region. Then follow the steps below.
13
+
14
+ ## Prerequisites
15
+
16
+ * Install git on your computer from: [https://git-scm.com](https://git-scm.com)
17
+ * You will also need to install nodejs and npm: [https://docs.npmjs.com/downloading-and-installing-node-js-and-npm](https://docs.npmjs.com/downloading-and-installing-node-js-and-npm)
18
+ * You will need an AWS Administrator account in your desired region to install.
19
+ * You will need AWS CDK v2 installed: [https://docs.aws.amazon.com/cdk/v2/guide/getting_started.html](https://docs.aws.amazon.com/cdk/v2/guide/getting_started.html)
20
+ * You will need to bootstrap the environment with CDK in both your primary region, and `us-east-1` if installing CloudFront and associated WAF.
21
+ ```bash
22
+ # Bootstrap your primary region
23
+ cdk bootstrap aws://<YOUR_AWS_ACCOUNT>/eu-west-1
24
+
25
+ # Bootstrap the us-east-1 region
26
+ cdk bootstrap aws://<YOUR_AWS_ACCOUNT>/us-east-1
27
+ ```
28
+ * In command line, write:
29
+ ```bash
30
+ git clone https://github.com/seanpedrick-case/doc_redaction.git
31
+ ```
32
+
33
+ # VPC and ACM Certificate
34
+
35
+ This CDK code is designed to work within an existing VPC; it does not create a new VPC for you. If you don't already have one, you will need to create it yourself.
36
+
37
+ Additionally, to get full HTTPS data transfer through the app, you will need an SSL certificate registered with AWS Certificate Manager.
38
+
39
+ You can either use the SSL certificate from a domain, or import an existing certificate into Certificate Manager. Ask your IT admin if you need help with this.
40
+
41
+ ## If getting an SSL certificate for an existing domain
42
+
43
+ Make sure to point the certificate to `*.<domain-name>`.
44
+
45
+ Update your DNS records to include the CNAME record given by AWS. After your stack has been created, you will also need to create a CNAME DNS record for your domain pointing to your load balancer DNS with a subdomain, e.g., `redaction.<domain-name>`.
46
+
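+ For illustration, the record pointing your subdomain at the load balancer would look something like the following in your DNS provider's settings (the values are placeholders):
+
+ ```
+ redaction.<domain-name>    CNAME    <your-load-balancer-DNS-name>
+ ```
+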
47
+ 1. Create a Python environment and install the packages from `requirements.txt`.
48
+
49
+ You need a `cdk.json` file in the `cdk` folder. It should contain the following:
50
+
51
+ ```json
52
+ {
53
+ "app": "<PATH TO PYTHON ENVIRONMENT FOLDER WHERE REQUIREMENTS HAVE BEEN LOADED>python.exe app.py",
54
+ "context": {
55
+ "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true,
56
+ "@aws-cdk/core:stackRelativeExports": true,
57
+ "@aws-cdk/aws-rds:lowercaseDbIdentifier": true,
58
+ "@aws-cdk/aws-lambda:recognizeVersionProps": true,
59
+ "@aws-cdk/aws-lambda:recognizeLayerVersion": true,
60
+ "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true,
61
+ "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true,
62
+ "@aws-cdk/core:newStyleStackSynthesis": true,
63
+ "aws-cdk:enableDiffNoFail": true,
64
+ "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true,
65
+ "@aws-cdk/aws-apigateway:disableCloudWatchRole": true,
66
+ "@aws-cdk/core:target-partitions": [
67
+ "aws",
68
+ "aws-cn"
69
+ ]
70
+ }
71
+ }
72
+ ```
73
+
74
+ 2. Create a `cdk_config.env` file in the `config` subfolder. As a minimum, it is useful to put the following details in this env file (below are example values; other possible variables can be seen in `cdk_config.py` in the `cdk` folder).
75
+
76
+ ```ini
77
+ CDK_PREFIX=example-prefix # Prefix to most created elements in your stack
78
+ VPC_NAME=example-vpc-name # Name of the VPC within which all the other elements will be created
79
+ AWS_REGION=us-west-1 # Region where elements will be created
80
+ AWS_ACCOUNT_ID=1234567890 # AWS account ID that has administrator access that you will use for deploying the stack
81
+ CDK_FOLDER=C:/path_to_cdk_folder/ # The place where the cdk folder code is located
82
+ CONTEXT_FILE=C:/path_to_cdk_folder/cdk.context.json
83
+ EXISTING_IGW_ID=igw-1234567890 # (optional) The ID for an existing internet gateway that you want to use instead of creating a new one
84
+ SINGLE_NAT_GATEWAY_ID=nat-123456789 # (optional) The ID for an existing NAT gateway that you want to use instead of creating a new one
85
+ COGNITO_USER_POOL_DOMAIN_PREFIX=lambeth-redaction-37924 # The prefix of the login / user sign up domain that you want to use with Cognito login. Should not contain the terms amazon, aws, or cognito.
86
+ RUN_USEAST_STACK=False # Set this to True only if you have permissions to create a Cloudfront distribution and web ACL on top of it in the us-east-1 region. If you don't, the section below shows how you can create the CloudFront resource manually and map it to your application load balancer (as you should have permissions for that if you are admin in your region).
87
+ ```
88
+
89
+ **Note: If you are using an SSL certificate with Cognito login on the application load balancer, you can set COGNITO_AUTH to 0 above, as you don't need the second login step to get to the app**
90
+
91
+ # Subnets
92
+
93
+ ### NOTE: I would generally advise creating new subnets, as then you can be sure about connectivity between the AWS resources that underpin your app.
94
+
95
+ * If you set no subnets, the app will try to use existing private and public subnets. This approach is risky, as the app may overlap with IP addresses assigned to existing AWS resources. It is advised to at least specify existing subnets that you know are available, or to create your own using one of the methods below.
96
+
97
+ * If you want to use existing subnets, you can list them in the following environment variables:
98
+ * `PUBLIC_SUBNETS_TO_USE=["PublicSubnet1", "PublicSubnet2", "PublicSubnet3"]`
99
+ * `PRIVATE_SUBNETS_TO_USE=["PrivateSubnet1", "PrivateSubnet2", "PrivateSubnet3"]`
100
+
101
+ * If you want to create new subnets, you need to also specify CIDR blocks and availability zones for the new subnets. The app will check with you upon deployment whether these CIDR blocks are available before trying to create.
102
+ * `PUBLIC_SUBNET_CIDR_BLOCKS=['10.222.333.0/28', '10.222.333.16/28', '10.222.333.32/28']`
103
+ * `PUBLIC_SUBNET_AVAILABILITY_ZONES=['eu-east-1a', 'eu-east-1b', 'eu-east-1c']`
104
+ * `PRIVATE_SUBNET_CIDR_BLOCKS=['10.222.333.48/28', '10.222.333.64/28', '10.222.333.80/28']`
105
+ * `PRIVATE_SUBNET_AVAILABILITY_ZONES=['eu-east-1a', 'eu-east-1b', 'eu-east-1c']`
106
+
107
+ If you try to create subnets in invalid CIDR blocks / availability zones, the console output will tell you, and it will show the currently occupied CIDR blocks to help you find space for the new subnets you want to create.
108
+
109
+ 3. In the command line, go to your `cdk` folder in the redaction app folder and run `cdk deploy --all`. This will attempt to deploy the first stack defined in the `app.py` file.
110
+
111
+ Hopefully everything will deploy successfully and you will be able to see your new stack in CloudFormation in the AWS console.
112
+
113
+ 4. Tasks for after CDK deployment
114
+
115
+ # Tasks performed by `post_cdk_build_quickstart.py`
116
+
117
+ **Note:** The following tasks are performed by the `post_cdk_build_quickstart.py` file that you can find in the `cdk` folder. You will need to run this while logged in with AWS SSO through the command line. The steps below describe how to do the same tasks in the AWS console, in case the `.py` file doesn't work for you.
118
+
119
+ ## Codebuild
120
+
121
+ You need to run the CodeBuild project after the stack has finished building, as there will be no container image in ECR yet.
122
+
123
+ Go to CodeBuild -> your project -> click Start build. Check the logs; the build should be progressing.
124
+
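+ If you prefer to script this step (similar in spirit to what `post_cdk_build_quickstart.py` does, though the actual script may differ), a minimal boto3 sketch is shown below; the project name is a placeholder.
+
+ ```python
+ # Hedged sketch: start the CodeBuild project programmatically.
+ # "your-project-name" is a placeholder - use the project created by the stack.
+ import boto3
+
+ codebuild = boto3.client("codebuild")
+ response = codebuild.start_build(projectName="your-project-name")
+ print(response["build"]["id"], response["build"]["buildStatus"])
+ ```
+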
125
+ ## Create a `config.env` file and upload to S3
126
+
127
+ The Fargate task definition references a `config.env` file.
128
+
129
+ You need to create a `config.env` file to upload to the S3 bucket, containing the following variables:
130
+
131
+ ```ini
132
+ COGNITO_AUTH=1
133
+ RUN_AWS_FUNCTIONS=1
134
+ SESSION_OUTPUT_FOLDER=True # If this is False it currently seems to fail to allow for writable log directories
135
+ ```
136
+
137
+ Go to S3 and choose the new `...-logs` bucket that you created. Upload the `config.env` file into this bucket.
138
+
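+ Alternatively, this upload can be scripted; a minimal boto3 sketch (the bucket name is a placeholder):
+
+ ```python
+ # Hedged sketch: upload config.env to the logs bucket created by the stack.
+ import boto3
+
+ s3 = boto3.client("s3")
+ s3.upload_file("config.env", "your-prefix-logs-bucket-name", "config.env")
+ ```
+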
139
+ ## Update Elastic Container Service
140
+
141
+ Now that the app container is in Elastic Container Registry, you can proceed to run the app on a Fargate server.
142
+
143
+ Go to your new cluster, your new service, and select 'Update service'.
144
+
145
+ Select 'Force new deployment', and then set 'Desired number of tasks' to 1.
146
+
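+ The same step can also be scripted; a minimal boto3 sketch (cluster and service names are placeholders):
+
+ ```python
+ # Hedged sketch: force a new deployment and set the desired task count to 1.
+ import boto3
+
+ ecs = boto3.client("ecs")
+ ecs.update_service(
+     cluster="your-cluster-name",
+     service="your-service-name",
+     desiredCount=1,
+     forceNewDeployment=True,
+ )
+ ```
+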
147
+ # Additional Manual Tasks
148
+
149
+ # Update DNS records for your domain (If using a domain for the SSL certificate)
150
+
151
+ To do this, you need to create a CNAME DNS record on a subdomain of your main domain registration (e.g., `redaction.<domain-name>`) pointing to your load balancer's DNS name.
152
+
153
+ # Cognito
154
+
155
+ Go to Cognito and create a user with your own email address. Generate a password.
156
+
157
+ Go to Cognito -> App clients -> Login pages -> View login page.
158
+
159
+ Enter the email and temporary password details that come in the email (don't include the last full stop!).
160
+
161
+ Change your password.
162
+
163
+ ## Set MFA (optional)
164
+ On the Cognito user pool page you can also enable MFA, if you are using an SSL certificate with Cognito login on the Application Load Balancer. Go to Cognito -> your user pool -> Sign in -> Multi-factor authentication.
165
+
166
+ # Create CloudFront distribution
167
+ **Note: this is only relevant if you set `RUN_USEAST_STACK` to 'False' during CDK deployment**
168
+
169
+ If you were not able to create a CloudFront distribution via CDK, you should be able to do it through the console. I would advise using CloudFront as the front end to the app.
170
+
171
+ Create a new CloudFront distribution.
172
+
173
+ * **If you have used an SSL certificate in your CDK code:**
174
+ * **For Origin:**
175
+ * Choose the domain name associated with the certificate as the origin.
176
+ * Choose HTTPS only as the protocol.
177
+ * Keep everything else default.
178
+ * **For Behavior (modify default behavior):**
179
+ * Under Viewer protocol policy choose 'Redirect HTTP to HTTPS'.
180
+
181
+ * **If you have not used an SSL certificate in your CDK code:**
182
+ * **For Origin:**
183
+ * Choose your elastic load balancer as the origin. This will fill in the elastic load balancer DNS.
184
+ * Choose HTTP only as the protocol.
185
+ * Keep everything else default.
186
+ * **For Behavior (modify default behavior):**
187
+ * Under Viewer protocol policy choose 'HTTP and HTTPS'.
188
+
189
+ ## Security features
190
+
191
+ In your CloudFront distribution, under 'Security' -> Edit -> Enable security protections.
192
+
193
+ Choose rate limiting (default is fine).
194
+
195
+ Create.
196
+
197
+ In CloudFront geographic restrictions -> Countries -> choose an Allow list of countries.
198
+
199
+ Click again on Edit.
200
+
201
+ With AWS WAF protection enabled, you should see a link titled 'View details of your configuration'.
202
+
203
+ Go to Rules -> `AWS-AWSManagedRulesCommonRuleSet`, click Edit.
204
+
205
+ Under `SizeRestrictions_BODY` choose rule action override 'Override to Allow'. This is needed to allow for file upload to the app.
206
+
207
+ # Change Cognito redirection URL to your CloudFront distribution
208
+
209
+ Go to Cognito -> your user pool -> App Clients -> Login pages -> Managed login configuration.
210
+
211
+ Ensure that the callback URL is:
212
+ * If not using an SSL certificate and Cognito login - `https://<CloudFront domain name>`
213
+ * If using an SSL certificate, you should have three:
214
+ * `https://<CloudFront domain name>`
215
+ * `https://<CloudFront domain name>/oauth2/idpresponse`
216
+ * `https://<CloudFront domain name>/oauth/idpresponse`
217
+
218
+ # Force traffic to come from specific CloudFront distribution (optional)
219
+
220
+ Note that this only potentially helps with security if you are not using an SSL certificate with Cognito login on your application load balancer.
221
+
222
+ Go to EC2 - Load Balancers -> Your load balancer -> Listeners -> Your listener -> Add rule.
223
+
224
+ * Add Condition -> Host header.
225
+ * Change Host header value to your CloudFront distribution without the `https://` or `http://` at the front.
226
+ * Forward to redaction target group.
227
+ * Turn on group stickiness for 12 hours.
228
+ * Next.
229
+ * Choose priority 1.
230
+
231
+ Then, change the default listener rule.
232
+
233
+ * Under Routing action change to 'Return fixed response'.
src/management_guide.qmd ADDED
@@ -0,0 +1,226 @@
1
+ ---
2
+ title: "User and AWS instance management guide"
3
+ format:
4
+ html:
5
+ toc: true # Enable the table of contents
6
+ toc-depth: 3 # Include headings up to level 2 (##)
7
+ toc-title: "On this page" # Optional: Title for your TOC
8
+ ---
9
+
10
+ This guide gives an overview of how to manage users of the redaction app, and how to start, stop, and manage instances of the app running on AWS Cloud.
11
+
12
+ # User management guide
13
+
14
+ This guide provides an overview for administrators to manage users within an AWS Cognito User Pool, specifically for an application utilising phone-app-based Two-Factor Authentication (2FA).
15
+
16
+ ## Managing Users in AWS Cognito User Pools
17
+
18
+ AWS Cognito User Pools provide a secure and scalable user directory for your applications. This guide focuses on common administrative tasks within the AWS Management Console.
19
+
20
+ ### Accessing Your User Pool
21
+
22
+ 1. Log in to the AWS Management Console.
23
+ 2. Navigate to **Cognito** (you can use the search bar).
24
+ 3. In the left navigation pane, select **User Pools**.
25
+ 4. Click on the name of the user pool associated with your redaction app.
26
+
27
+ ### Creating Users
28
+
29
+ Creating a new user in Cognito involves setting their initial credentials and attributes.
30
+
31
+ 1. From your User Pool's dashboard, click on the **Users** tab.
32
+ 2. Click the **Create user** button.
33
+ 3. **Username:** Enter a unique username for the user. This is what they will use to log in.
34
+ 4. **Temporary password:**
35
+ * Select **Generate a password** to have Cognito create a strong, temporary password.
36
+ * Alternatively, you can choose **Set a password** and enter one manually. If you do this, ensure it meets the password policy configured for your user pool.
37
+ * **Important:** Cognito will typically require users to change this temporary password upon their first login.
38
+ 5. **Email:** Enter the user's email address. This is crucial for communication and potentially for password recovery if configured.
39
+ 6. **Phone number (optional):** The phone number is not needed for login or user management in this app, so you can leave this blank.
40
+ 7. **Mark email as verified/Mark phone number as verified:** For new users, you can choose to automatically verify their email and/or phone number. If unchecked, the user might need to verify these themselves during the signup process (depending on your User Pool's verification settings).
41
+ 8. **Groups (optional):** If you have defined groups in your user pool, you can add the user to relevant groups here. Groups are useful for managing permissions and access control within your application.
42
+ 9. Click **Create user**.
43
+
44
+ ### Information to Give to Users to Sign Up
45
+
46
+ Once a user is created, they'll need specific information to access the application.
47
+
48
+ * **Application URL:** The web address of your redaction app's login page.
49
+ * **Username:** The username you created for them in Cognito.
50
+ * **Temporary Password:** The temporary password you generated or set.
51
+ * **Instructions for First Login:**
52
+ * "Upon your first login, you will be prompted to change your temporary password to a new, secure password."
53
+ * "You will also need to set up Two-Factor Authentication using a phone authenticator app (e.g., Google Authenticator, Authy)."
54
+
55
+ ### Resetting User Access (Password Reset)
56
+
57
+ If a user forgets their password or needs their access reset, you can do this in the console.
58
+
59
+ 1. From your User Pool's dashboard, click on the **Users** tab.
60
+ 2. Locate the user you wish to reset. You can use the search bar.
61
+ 3. Click on the user's username.
62
+ 4. On the user details page, click the **Reset password** button.
63
+ 5. Cognito will generate a new temporary password and mark the user to change it on next login.
64
+ 6. **Important:** You will need to communicate this new temporary password to the user securely.
65
+
66
+ ### Two-Factor Authentication (2FA) with Apps Only
67
+
68
+ Your application uses phone app-based 2FA. This section covers what administrators need to know.
69
+
70
+ #### How it Works for the User
71
+
72
+ When a user logs in for the first time or when 2FA is enabled for their account, they will be prompted to set up 2FA. This typically involves:
73
+
74
+ 1. **Scanning a QR Code:** The application will display a QR code.
75
+ 2. **Using an Authenticator App:** The user opens their authenticator app (e.g., Google Authenticator, Authy, Microsoft Authenticator) and scans the QR code.
76
+ 3. **Entering a Code:** The authenticator app will generate a time-based one-time password (TOTP). The user enters this code into the application to verify the setup.
77
+
78
+ #### Administrator's Role in 2FA
79
+
80
+ As an administrator, you generally don't directly "set up" the user's 2FA device in the console. The user performs this self-enrollment process within the application. However, you can manage the 2FA status of a user:
81
+
82
+ 1. **Enabling/Disabling 2FA for a User:**
83
+ * From your User Pool's dashboard, click on the **Users** tab.
84
+ * Click on the user's username.
85
+ * Under the "Multi-factor authentication (MFA)" section, you'll see the current MFA status.
86
+ * If 2FA is not enabled, you might have the option to "Enable MFA" for the user. If your user pool requires 2FA, it might be automatically enabled upon signup.
87
+ * You can also **Disable MFA** for a user if necessary. This will remove their registered 2FA device and they will no longer be prompted for a 2FA code during login until they re-enroll.
88
+ 2. **Removing a User's 2FA Device:** If a user loses their phone or needs to re-configure 2FA, you can remove their existing MFA device.
89
+ * On the user's details page, under the "Multi-factor authentication (MFA)" section, you will see a list of registered MFA devices (if any).
90
+ * Select the device and click **Remove**.
91
+ * The next time the user logs in, they will be prompted to set up 2FA again.
92
+
93
+ ### Other Useful Information for Administrators
94
+
95
+ * **User Status:** In the "Users" tab, you'll see the status of each user (e.g., `CONFIRMED`, `UNCONFIRMED`, `FORCE_CHANGE_PASSWORD`, `ARCHIVED`, `COMPROMISED`).
96
+ * `CONFIRMED`: User has confirmed their account and set their password.
97
+ * `UNCONFIRMED`: User has been created but hasn't confirmed their account (e.g., through email verification) or changed their temporary password.
98
+ * `FORCE_CHANGE_PASSWORD`: User must change their password on next login.
99
+ * **Searching and Filtering Users:** The "Users" tab provides search and filtering options to quickly find specific users or groups of users.
100
+ * **User Attributes:** You can view and sometimes edit user attributes (like email, phone number, custom attributes) on the user's detail page.
101
+ * **Groups:**
102
+ * You can create and manage groups under the **Groups** tab of your User Pool.
103
+ * Groups are useful for organising users and applying different permissions or configurations through AWS Identity and Access Management (IAM) roles.
104
+ * **User Pool Settings:**
105
+ * Explore the various settings under the **User Pool Properties** tab (e.g., Policies, MFA and verifications, Message customisations).
106
+ * **Policies:** Define password complexity requirements.
107
+ * **MFA and verifications:** Configure whether MFA is optional, required, or disabled, and the types of MFA allowed (SMS, TOTP). Ensure "Authenticator apps" is enabled for your setup.
108
+ * **Message customisations:** Customise the email and SMS messages sent by Cognito (e.g., for verification codes, password resets).
109
+ * **Monitoring and Logging:**
110
+ * Integrate your Cognito User Pool with AWS CloudWatch to monitor user activities and potential issues.
111
+ * Enable CloudTrail logging for Cognito to track API calls and administrative actions.
112
+ * **Security Best Practices:**
113
+ * Always use strong, unique passwords for your AWS console login.
114
+ * Enable MFA for your AWS console login.
115
+ * Regularly review user access and permissions.
116
+ * Educate users on strong password practices and the importance of 2FA.
117
+
118
+ By understanding these features and following best practices, administrators can effectively manage users within their AWS Cognito User Pool, ensuring secure and smooth operation of their redaction application.
119
+
120
+ # Guide to running app instances on AWS
121
+
122
+ This guide provides basic instructions for administrators to manage service tasks within AWS Elastic Container Service (ECS) using the AWS Management Console, focusing on scaling services on and off and forcing redeployments.
123
+
124
+ ## Basic Service Task Management in AWS ECS Console
125
+
126
+ AWS Elastic Container Service (ECS) allows you to run, stop, and manage Docker containers on a cluster. This guide focuses on managing your ECS *services*, which maintain a desired number of tasks (container instances).
127
+
128
+ ### Accessing Your ECS Cluster and Services
129
+
130
+ 1. Log in to the AWS Management Console.
131
+ 2. Navigate to **ECS (Elastic Container Service)** (you can use the search bar).
132
+ 3. In the left navigation pane, select **Clusters**.
133
+ 4. Click on the name of the ECS cluster where your redaction app's service is running.
134
+
135
+ ### Understanding Services and Tasks
136
+
137
+ Before we dive into management, let's clarify key concepts:
138
+
139
+ * **Task Definition:** A blueprint for your application. It specifies the Docker image, CPU, memory, environment variables, port mappings, and other configurations for your containers.
140
+ * **Task:** An actual running instance of a task definition. It's an individual container or a set of tightly coupled containers running together.
141
+ * **Service:** A mechanism that allows you to run and maintain a specified number of identical tasks simultaneously in an ECS cluster. The service ensures that if a task fails or stops, it's replaced. It also handles load balancing and scaling.
142
+
143
+ ### Setting the Number of Running Tasks to 0 (Turning Everything Off)
144
+
145
+ Setting the desired number of tasks to 0 for a service effectively "turns off" your application by stopping all its running containers.
146
+
147
+ 1. From your Cluster's dashboard, click on the **Services** tab.
148
+ 2. Locate the service associated with your redaction app (e.g., `redaction-app-service`).
149
+ 3. Select the service by checking the box next to its name.
150
+ 4. Click the **Update** button.
151
+ 5. On the "Configure service" page, find the **Number of tasks** field.
152
+ 6. Change the value in this field to `0`.
153
+ 7. Scroll to the bottom and click **Update service**.
154
+
155
+ **What happens next:**
156
+
157
+ * ECS will begin terminating all running tasks associated with that service.
158
+ * The "Running tasks" count for your service will gradually decrease to 0.
159
+ * Your application will become inaccessible as its containers are stopped.
160
+
161
+ **Important Considerations:**
162
+
163
+ * **Cost Savings:** Setting tasks to 0 can save costs by stopping the consumption of compute resources (CPU, memory) for your containers.
164
+ * **Associated Resources:** This action *only* stops the ECS tasks. It does not stop underlying EC2 instances (if using EC2 launch type), associated databases, load balancers, or other AWS resources. You'll need to manage those separately if you want to completely shut down your environment.
165
+ * **Container Images:** Your Docker images will still reside in Amazon ECR (or wherever you store them).
166
+ * **Downtime:** This action will cause immediate downtime for your application.
167
+
168
+ ### Turning the Desired Number of Tasks On
169
+
170
+ To bring your application back online, you'll set the desired number of tasks to your operational value (usually 1 or more).
171
+
172
+ 1. From your Cluster's dashboard, click on the **Services** tab.
173
+ 2. Locate the service associated with your redaction app.
174
+ 3. Select the service by checking the box next to its name.
175
+ 4. Click the **Update** button.
176
+ 5. On the "Configure service" page, find the **Number of tasks** field.
177
+ 6. Change the value in this field to your desired number of running tasks (e.g., `1`, `2`, etc.).
178
+ 7. Scroll to the bottom and click **Update service**.
179
+
180
+ **What happens next:**
181
+
182
+ * ECS will begin launching new tasks based on your service's configuration and task definition.
183
+ * The "Running tasks" count will increase until it reaches your desired number.
184
+ * Once tasks are running and healthy (according to your health checks), your application should become accessible again.
185
+
186
+ **Important Considerations:**
187
+
188
+ * **Startup Time:** Allow some time for tasks to pull images, start containers, and pass health checks before your application is fully available.
189
+ * **Resource Availability:** Ensure your ECS cluster has sufficient available resources (EC2 instances or Fargate capacity) to launch the desired number of tasks.
190
+
191
+ ### Forcing Redeployment
192
+
193
+ Forcing a redeployment is useful when you've updated your task definition (e.g., pushed a new Docker image, changed environment variables) but the service hasn't automatically picked up the new version. It's also useful for "restarting" a service.
194
+
195
+ 1. From your Cluster's dashboard, click on the **Services** tab.
196
+ 2. Locate the service you want to redeploy.
197
+ 3. Select the service by checking the box next to its name.
198
+ 4. Click the **Update** button.
199
+ 5. On the "Configure service" page, scroll down to the **Deployment options** section.
200
+ 6. Check the box next to **Force new deployment**.
201
+ 7. Scroll to the bottom and click **Update service**.
202
+
203
+ **What happens next:**
204
+
205
+ * ECS will initiate a new deployment for your service.
206
+ * It will launch new tasks using the *latest active task definition revision* associated with your service.
207
+ * Existing tasks will be drained and terminated according to your service's deployment configuration (e.g., `minimum healthy percent`, `maximum percent`).
208
+ * This process effectively replaces all running tasks with fresh instances.
209
+
210
+ **Important Considerations:**
211
+
212
+ * **Latest Task Definition:** Ensure you have activated the correct and latest task definition revision before forcing a new deployment if your intention is to deploy new code. You can update the task definition used by a service via the "Update" service flow.
213
+ * **Downtime (minimal if configured correctly):** If your service has a properly configured load balancer and healthy deployment settings (e.g., blue/green or rolling updates), forced redeployments should result in minimal to no downtime. ECS will bring up new tasks before shutting down old ones.
214
+ * **Troubleshooting:** If a deployment gets stuck or tasks fail to start, check the "Events" tab of your service for error messages. Also, check the CloudWatch logs for your tasks.
215
+
216
+ ### Other Useful Information for Administrators
217
+
218
+ * **Service Events:** On your service's detail page, click the **Events** tab. This provides a chronological log of actions taken by the ECS service, such as task launches, stops, and scaling events. This is invaluable for troubleshooting.
219
+ * **Tasks Tab:** On your service's detail page, click the **Tasks** tab to see a list of all individual tasks running (or recently stopped) for that service. You can click on individual tasks to view their details, including logs, network configuration, and CPU/memory utilisation.
220
+ * **Logs:** For each task, you can often find a link to its CloudWatch Logs under the "Logs" section of the task details. This is critical for debugging application errors.
221
+ * **Metrics:** The **Metrics** tab on your service provides graphs for CPU utilisation, memory utilisation, and the number of running tasks, helping you monitor your service's performance.
222
+ * **Deployment Configuration:** When updating a service, review the **Deployment options** section. This allows you to control how new deployments are rolled out (e.g., minimum healthy percent, maximum percent). Proper configuration here ensures minimal impact during updates.
223
+ * **Auto Scaling (beyond basic management):** For dynamic scaling based on demand, explore **Service Auto Scaling**. This allows ECS to automatically adjust the desired number of tasks up or down based on metrics like CPU utilisation or request count.
224
+ * **Task Definitions:** Before updating a service, you might need to create a new revision of your task definition if you're deploying new code or configuration changes to your containers. You can find Task Definitions in the left navigation pane under ECS.
225
+
226
+ By mastering these basic service management operations in the AWS Console, administrators can effectively control the lifecycle of their ECS-based applications.
src/styles.css ADDED
@@ -0,0 +1 @@
1
+ /* Custom styles can be added here later */
src/user_guide.qmd ADDED
@@ -0,0 +1,543 @@
1
+ ---
2
+ title: "User guide"
3
+ format:
4
+ html:
5
+ toc: true # Enable the table of contents
6
+ toc-depth: 3 # Include headings up to level 3 (##)
7
+ toc-title: "On this page" # Optional: Title for your TOC
8
+ ---
9
+
10
+ ## Table of contents
11
+
12
+ - [Example data files](#example-data-files)
13
+ - [Basic redaction](#basic-redaction)
14
+ - [Customising redaction options](#customising-redaction-options)
15
+ - [Custom allow, deny, and page redaction lists](#custom-allow-deny-and-page-redaction-lists)
16
+ - [Allow list example](#allow-list-example)
17
+ - [Deny list example](#deny-list-example)
18
+ - [Full page redaction list example](#full-page-redaction-list-example)
19
+ - [Redacting additional types of personal information](#redacting-additional-types-of-personal-information)
20
+ - [Redacting only specific pages](#redacting-only-specific-pages)
21
+ - [Handwriting and signature redaction](#handwriting-and-signature-redaction)
22
+ - [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
23
+ - [Redacting tabular data files (CSV/XLSX) or copy and pasted text](#redacting-tabular-data-files-xlsxcsv-or-copy-and-pasted-text)
24
+
25
+ See the [advanced user guide here](#advanced-user-guide):
26
+ - [Merging redaction review files](#merging-redaction-review-files)
27
+ - [Identifying and redacting duplicate pages](#identifying-and-redacting-duplicate-pages)
28
+ - [Fuzzy search and redaction](#fuzzy-search-and-redaction)
29
+ - [Export redactions to and import from Adobe Acrobat](#export-to-and-import-from-adobe)
30
+ - [Exporting to Adobe Acrobat](#exporting-to-adobe-acrobat)
31
+ - [Importing from Adobe Acrobat](#importing-from-adobe-acrobat)
32
+ - [Using the AWS Textract document API](#using-the-aws-textract-document-api)
33
+ - [Using AWS Textract and Comprehend when not running in an AWS environment](#using-aws-textract-and-comprehend-when-not-running-in-an-aws-environment)
34
+ - [Modifying existing redaction review files](#modifying-existing-redaction-review-files)
35
+
36
+ ## Example data files
37
+
38
+ Please try these example files to follow along with this guide:
39
+ - [Example of files sent to a professor before applying](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/example_of_emails_sent_to_a_professor_before_applying.pdf)
40
+ - [Example complaint letter (jpg)](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/example_complaint_letter.jpg)
41
+ - [Partnership Agreement Toolkit (for signatures and more advanced usage)](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/Partnership-Agreement-Toolkit_0_0.pdf)
42
+ - [Dummy case note data](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv)
43
+
44
+ ## Basic redaction
45
+
46
+ The document redaction app can detect personally-identifiable information (PII) in documents. Documents can be redacted directly, or suggested redactions can be reviewed and modified using a graphical user interface. Basic document redaction can be performed quickly using the default options.
47
+
48
+ Download the example PDFs above to your computer. Open up the redaction app with the link provided by email.
49
+
50
+ ![Upload files](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/file_upload_highlight.PNG)
51
+
52
+ ### Upload files to the app
53
+
54
+ The 'Redact PDFs/images' tab currently accepts PDFs and image files (JPG, PNG) for redaction. Click on the 'Drop files here or Click to Upload' area of the screen, and select one of the three different [example files](#example-data-files) (they should all be stored in the same folder if you want them to be redacted at the same time).
55
+
56
+ ### Text extraction
57
+
58
+ First, select one of the three text extraction options:
59
+ - **'Local model - selectable text'** - This will read text directly from PDFs that have selectable text to redact (using PikePDF). This is fine for most PDFs, but will find nothing if the PDF does not have selectable text, and it is not good for handwriting or signatures. If it encounters an image file, it will send it onto the second option below.
60
+ - **'Local OCR model - PDFs without selectable text'** - This option will use a simple Optical Character Recognition (OCR) model (Tesseract) to pull out text from a PDF/image that it 'sees'. This can handle most typed text in PDFs/images without selectable text, but struggles with handwriting/signatures. If you are interested in the latter, then you should use the third option if available.
61
+ - **'AWS Textract service - all PDF types'** - Only available for instances of the app running on AWS. AWS Textract is a service that performs OCR on documents within their secure service. This is a more advanced version of OCR compared to the local option, and carries a (relatively small) cost. Textract excels in complex documents based on images, or documents that contain a lot of handwriting and signatures.
62
+
63
+ ### Optional - select signature extraction
64
+ If you chose the AWS Textract service above, you can choose if you want handwriting and/or signatures redacted by default. Choosing signatures here will have a cost implication, as identifying signatures will cost ~£2.66 ($3.50) per 1,000 pages vs ~£1.14 ($1.50) per 1,000 pages without signature detection.
65
+
66
+ ![AWS Textract handwriting and signature options](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/textract_handwriting_signatures.PNG)
67
+
68
+ ### PII redaction method
69
+
70
+ If you are running with the AWS service enabled, here you will also have a choice for PII redaction method:
71
+ - **'Only extract text - (no redaction)'** - If you are only interested in getting the text out of the document for further processing (e.g. to find duplicate pages, or to review text on the Review redactions page)
72
+ - **'Local'** - This uses the spacy package to rapidly detect PII in extracted text. This method is often sufficient if you are just interested in redacting specific terms defined in a custom list.
73
+ - **'AWS Comprehend'** - This method calls an AWS service to provide more accurate identification of PII in extracted text.
74
+
75
+ ### Optional - costs and time estimation
76
+ If the option is enabled (by your system admin, in the config file), you will see a cost and time estimate for the redaction process. 'Existing Textract output file found' will be checked automatically if previous Textract text extraction files exist in the output folder, or have been [previously uploaded by the user](#aws-textract-outputs) (saving time and money for redaction).
77
+
78
+ ![Cost and time estimation](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/costs_and_time.PNG)
79
+
80
+ ### Optional - cost code selection
81
+ If the option is enabled (by your system admin, in the config file), you may be prompted to select a cost code before continuing with the redaction task.
82
+
83
+ ![Cost code selection](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/cost_code_selection.PNG)
84
+
85
+ The relevant cost code can be found either by: 1. Using the search bar above the data table to find relevant cost codes, then clicking on the relevant row, or 2. typing it directly into the dropdown to the right, where it should filter as you type.
86
+
87
+ ### Optional - Submit whole documents to Textract API
88
+ If this option is enabled (by your system admin, in the config file), you will have the option to submit whole documents in quick succession to the AWS Textract service to get extracted text outputs quickly (faster than using the 'Redact document' process described here). This feature is described in more detail in the [advanced user guide](#using-the-aws-textract-document-api).
89
+
90
+ ![Textract document API](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/textract_document_api.PNG)
91
+
92
+ ### Redact the document
93
+
94
+ Click 'Redact document'. After loading in the document, the app should be able to process about 30 pages per minute (depending on the redaction methods chosen above). When ready, you should see a message saying that processing is complete, with output files appearing in the bottom right.
95
+
96
+ ### Redaction outputs
97
+
98
+ ![Redaction outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/redaction_outputs.PNG)
99
+
100
+ - **'...redacted.pdf'** files contain the original pdf with suggested redacted text deleted and replaced by a black box on top of the document.
101
+ - **'...ocr_results.csv'** files contain the line-by-line text outputs from the entire document. This file can be useful for later searching through for any terms of interest in the document (e.g. using Excel or a similar program).
102
+ - **'...review_file.csv'** files are the review files that contain details and locations of all of the suggested redactions in the document. This file is key to the [review process](#reviewing-and-modifying-suggested-redactions), and should be downloaded to use later for this.
103
+
104
+ ### Additional AWS Textract / local OCR outputs
105
+
106
+ If you have used the AWS Textract option for extracting text, you may also see a '..._textract.json' file. This file contains all the relevant extracted text information that comes from the AWS Textract service. You can keep this file and upload it at a later date alongside your input document, which will enable you to skip calling AWS Textract every single time you want to do a redaction task, as follows:
107
+
108
+ ![Document upload alongside Textract](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/document_upload_with_textract.PNG)
109
+
110
+ Similarly, if you have used the 'Local OCR method' to extract text, you may see a '..._ocr_results_with_words.json' file. This file works in the same way as the AWS Textract .json results described above, and can be uploaded alongside an input document to save time on text extraction in future in the same way.
111
+
112
+ ### Downloading output files from previous redaction tasks
113
+
114
+ If you are logged in via AWS Cognito and you lose your app page for some reason (e.g. from a crash, reloading), it is possible recover your previous output files, provided the server has not been shut down since you redacted the document. Go to 'Redaction settings', then scroll to the bottom to see 'View all output files from this session'.
115
+
116
+ ![View all output files](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/view_all_output_files.PNG)
117
+
118
+ ### Basic redaction summary
119
+
120
+ We have covered redacting documents with the default redaction options. The '...redacted.pdf' file output may be enough for your purposes. But it is very likely that you will need to customise your redaction options, which we will cover below.
121
+
122
+ ## Customising redaction options
123
+
124
+ On the 'Redaction settings' page, there are a number of options that you can tweak to better match your use case and needs.
125
+
126
+ ### Custom allow, deny, and page redaction lists
127
+
128
+ The app allows you to specify terms that should never be redacted (an allow list), terms that should always be redacted (a deny list), and also to provide a list of page numbers for pages that should be fully redacted.
129
+
130
+ ![Custom allow, deny, and page redaction lists](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/allow_deny_full_page_list.PNG)
131
+
132
+ #### Allow list example
133
+
134
+ It may be the case that specific terms that are frequently redacted are not actually personal information in your context, and so do not need to be redacted.
135
+
136
+ In the redacted outputs of the 'Example of files sent to a professor before applying' PDF, you can see that it is frequently redacting references to Dr Hyde's lab in the main body of the text. Let's say that references to Dr Hyde were not considered personal information in this context. You can exclude this term from redaction (and others) by providing an 'allow list' file. This is simply a csv that contains the case sensitive terms to exclude in the first column, in our example, 'Hyde' and 'Muller glia'. The example file is provided [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/allow_list/allow_list.csv).
137
+
138
+ To import this for use with your redaction tasks, go to the 'Redaction settings' tab, click on the 'Import allow list file' button halfway down, and select the csv file you have created. It will be applied the next time you hit the redact button, so go back to the first tab and redact the document again to see the effect.
139
+
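+ If you would rather create this file with a short script than in a spreadsheet editor, a minimal sketch using pandas is below (the terms are taken from the example above; the output is a plain one-column csv with no header, as the app expects):
+ 
+ ```python
+ import pandas as pd
+ 
+ # Each row is a case-sensitive term that should never be redacted.
+ allow_terms = ["Hyde", "Muller glia"]
+ 
+ # Write a one-column csv with no header row, matching the allow list format.
+ pd.DataFrame(allow_terms).to_csv("allow_list.csv", index=False, header=False)
+ ```
+ 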
140
+ #### Deny list example
141
+
142
+ Say you wanted to remove specific terms from a document. In this app you can do this by providing a custom deny list as a csv. Like the allow list described above, this should be a one-column csv without a column header. The app will suggest a redaction for each term in the list, matched with exact spelling as whole words only, so it will not select text from within longer words. To enable this feature, the 'CUSTOM' tag needs to be chosen as a redaction entity [(the process for adding/removing entity types to redact is described below)](#redacting-additional-types-of-personal-information).
143
+
144
+ Here is an example using the [Partnership Agreement Toolkit file](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/Partnership-Agreement-Toolkit_0_0.pdf). This is an [example of a custom deny list file](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/allow_list/partnership_toolkit_redact_custom_deny_list.csv). 'Sister', 'Sister City',
145
+ 'Sister Cities', and 'Friendship City' have been listed as specific terms to redact. You can see the outputs of this redaction process on the review page:
146
+
147
+ ![Deny list redaction Partnership file](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/deny_list_partnership_example.PNG).
148
+
149
+ You can see that the app has highlighted all instances of these terms on the page shown. You can then consider each of these terms for modification or removal on the review page [explained here](#reviewing-and-modifying-suggested-redactions).
150
+
151
+ #### Full page redaction list example
152
+
153
+ There may be full pages in a document that you want to redact. The app also provides the capability of redacting pages completely based on a list of input page numbers in a csv. The format of the input file is the same as that for the allow and deny lists described above - a one-column csv without a column header. An [example of this is here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/allow_list/partnership_toolkit_redact_some_pages.csv). You can see an example of the redacted page on the review page:
154
+
155
+ ![Whole page partnership redaction](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/whole_page_partnership_example.PNG).
156
+
157
+ Using the above approaches to allow, deny, and full page redaction lists will give you an output [like this](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/allow_list/Partnership-Agreement-Toolkit_0_0_redacted.pdf).
158
+
159
+ #### Adding to the loaded allow, deny, and whole page lists in-app
160
+
161
+ If you open the accordion below the allow list options called 'Manually modify custom allow...', you should be able to see a few tables with options to add new rows:
162
+
163
+ ![Manually modify allow or deny list](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/manually_modify.PNG)
164
+
165
+ If the table is empty, you can add a new row by clicking on the '+' item below each table header. If there is existing data, you may need to click on the three dots to the right and select 'Add row below'. Type the item you wish to keep/remove in the cell, and then (important) press Enter to add this new item to the allow/deny/whole page list. Your output tables should look something like below.
166
+
167
+ ![Manually modify allow or deny list filled](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/manually_modify_filled.PNG)
168
+
169
+ **Note:** As of version 0.7.0 you can now apply your whole page redaction list directly to the document file currently under review by clicking the 'Apply whole page redaction list to document currently under review' button that appears here.
170
+
171
+ ### Redacting additional types of personal information
172
+
173
+ You may want to redact additional types of information beyond the defaults, or you may not be interested in some of the default entity types. For example, there are dates in the example complaint letter. Suppose we wanted to redact those dates as well.
174
+
175
+ Under the 'Redaction settings' tab, go to 'Entities to redact (click close to down arrow for full list)'. Different dropdowns are provided according to whether you are using the Local service to redact PII, or the AWS Comprehend service. Click within the empty box close to the dropdown arrow and you should see a list of possible 'entities' to redact. Select 'DATE_TIME' and it should appear in the main list. To remove items, click on the 'x' next to their name.
176
+
177
+ ![Redacting additional types of information dropdown](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/additional_entities/additional_entities_select.PNG)
178
+
179
+ Now, go back to the main screen and click 'Redact Document' again. You should now get a redacted version of 'Example complaint letter' that has the dates and times removed.
180
+
181
+ If you want to redact different files, I suggest you refresh your browser page to start a new session and unload all previous data.
182
+
183
+ ## Redacting only specific pages
184
+
185
+ Suppose we are only interested in redacting page 1 of the loaded documents. On the Redaction settings tab, set 'Lowest page to redact' to 1, and 'Highest page to redact' also to 1. When you next redact your documents, only the first page will be modified.
186
+
187
+ ![Selecting specific pages to redact](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/allow_list/select_pages.PNG)
188
+
189
+ ## Handwriting and signature redaction
190
+
191
+ The file [Partnership Agreement Toolkit (for signatures and more advanced usage)](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/Partnership-Agreement-Toolkit_0_0.pdf) is provided as an example document to test AWS Textract + redaction with a document that contains signatures. If you have access to AWS Textract in the app, try removing all entity types from redaction on the Redaction settings tab by clicking the big X to the right of 'Entities to redact'.
192
+
193
+ To ensure that handwriting and signature detection are enabled (they are enabled by default), on the front screen go to the 'AWS Textract signature detection' area to enable/disable the following options:
194
+
195
+ ![Handwriting and signatures](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/textract_handwriting_signatures.PNG)
196
+
197
+ The outputs should show handwriting/signatures redacted (see pages 5 - 7), which you can inspect and modify on the 'Review redactions' tab.
198
+
199
+ ![Handwriting and signatures redacted example](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/refs/heads/main/review_redactions/Signatures%20and%20handwriting%20found.PNG)
200
+
201
+ ## Reviewing and modifying suggested redactions
202
+
203
+ Sometimes the app will suggest redactions that are incorrect, or will miss personal information entirely. The app allows you to review and modify suggested redactions to compensate for this. You can do this on the 'Review redactions' tab.
204
+
205
+ We will go through ways to review suggested redactions with an example. On the first tab, 'PDFs/images', upload the ['Example of files sent to a professor before applying.pdf'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/example_of_emails_sent_to_a_professor_before_applying.pdf) file. Let's stick with the 'Local model - selectable text' option, and click 'Redact document'. Once the outputs are created, go to the 'Review redactions' tab.
206
+
207
+ On the 'Review redactions' tab you have a visual interface that allows you to inspect and modify redactions suggested by the app. There are quite a few options to look at, so we'll go from top to bottom.
208
+
209
+ ![Review redactions](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/review_redactions.PNG)
210
+
211
+ ### Uploading documents for review
212
+
213
+ At the top is a file upload area where you can upload the original, unredacted PDF alongside the '..._review_file.csv' that is produced by the redaction process. Once you have uploaded these two files, click the '**Review redactions based on original PDF...**' button to load in the files for review. This will allow you to visualise and modify the suggested redactions using the interface below.
214
+
215
+ Optionally, you can also upload one of the '..._ocr_output.csv' files that come out of a redaction task here, so that you can navigate the extracted text from the document.
216
+
217
+ ![Search extracted text](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/search_extracted_text.PNG)
218
+
219
+ You can upload all three files to the box (the unredacted document, the '..._review_file.csv', and the '..._ocr_output.csv' file) before clicking '**Review redactions based on original PDF...**', as in the image below:
220
+
221
+ ![Upload three files for review](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/upload_three_files.PNG)
222
+
223
+ **NOTE:** ensure you upload the ***unredacted*** document here and not the redacted version, otherwise you will be checking over a document that already has redaction boxes applied!
224
+
225
+ ### Page navigation
226
+
227
+ You can change the page viewed either by clicking 'Previous page' or 'Next page', or by typing a specific page number in the 'Current page' box and pressing Enter on your keyboard. Each time you switch pages, the app will save any redactions you have made on the page you are moving from, so you will not lose your changes.
228
+
229
+ You can also navigate to different pages by clicking on rows in the tables under 'Search suggested redactions' to the right, or 'search all extracted text' (if enabled) beneath that.
230
+
231
+ ### The document viewer pane
232
+
233
+ On the selected page, each redaction is highlighted with a box next to its suggested redaction label (e.g. person, email).
234
+
235
+ ![Document view pane](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/document_viewer_pane.PNG)
236
+
237
+ There are a number of different options for adding and modifying redaction boxes, and for moving around the page, in the document viewer pane. To zoom in and out of the page, use your mouse wheel. To move around the page while zoomed, you need to be in modify mode. Scroll to the bottom of the document viewer to see the relevant controls. You should see a box icon, a hand icon, and two arrows pointing counter-clockwise and clockwise.
238
+
239
+ ![Change redaction mode](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/change_review_mode.PNG)
240
+
241
+ Click on the hand icon to go into modify mode. Clicking and holding on the document viewer then allows you to move around the page when zoomed in. To rotate the page, you can click on either of the round arrow buttons to turn in that direction.
242
+
243
+ **NOTE:** When you switch page, the viewer will stay in your selected orientation, so if it looks strange, just rotate the page again and hopefully it will look correct!
244
+
245
+ #### Modify existing redactions (hand icon)
246
+
247
+ After clicking on the hand icon, the interface allows you to modify existing redaction boxes. When in this mode, you can click and hold on an existing box to move it.
248
+
249
+ ![Modify existing redaction box](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/modify_existing_redaction_box.PNG)
250
+
251
+ Click on one of the small boxes at the edges to change the size of the box. To delete a box, click on it to highlight it, then press delete on your keyboard. Alternatively, double click on a box and click 'Remove' on the box that appears.
252
+
253
+ ![Remove existing redaction box](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/existing_redaction_box_remove.PNG)
254
+
255
+ #### Add new redaction boxes (box icon)
256
+
257
+ To change to 'add redaction boxes' mode, scroll to the bottom of the page. Click on the box icon, and your cursor will change into a crosshair. Now you can add new redaction boxes where you wish. A popup will appear when you create a new box so you can select a label and colour for the new box.
258
+
259
+ #### 'Locking in' new redaction box format
260
+
261
+ It is possible to lock in a chosen format for new redaction boxes so that you don't have the popup appearing each time. When you make a new box, select the options for your 'locked' format, and then click on the lock icon on the left side of the popup, which should turn blue.
262
+
263
+ ![Lock redaction box format](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/new_redaction_box_lock_mode.PNG)
264
+
265
+ You can now add new redaction boxes without a popup appearing. If you want to change or 'unlock' your chosen box format, you can click on the new icon that has appeared at the bottom of the document viewer pane that looks a little like a gift tag. You can then change the defaults, or click on the lock icon again to 'unlock' the new box format - popups will then appear again each time you create a new box.
266
+
267
+ ![Change or unlock redaction box format](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/change_review_mode_with_lock.PNG)
268
+
269
+ ### Apply redactions to PDF and Save changes on current page
270
+
271
+ Once you have reviewed all the redactions in your document and you are happy with the outputs, you can click 'Apply revised redactions to PDF' to create a new '_redacted.pdf' output alongside a new '_review_file.csv' output.
272
+
273
+ If you are working on a page and haven't saved for a while, you can click 'Save changes on current page to file' to ensure that they are saved to an updated 'review_file.csv' output.
274
+
275
+ ![Review modified outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/review_mod_outputs.PNG)
276
+
277
+ ### Selecting and removing redaction boxes using the 'Search suggested redactions' table
278
+
279
+ The table shows a list of all the suggested redactions in the document alongside the page, label, and text (if available).
280
+
281
+ ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/list_find_labels.PNG)
282
+
283
+ If you click on one of the rows in this table, you will be taken to the page of the redaction. Clicking on a redaction row on the same page will change the colour of the redaction box to blue to help you locate it in the document viewer (this only applies while using the app, not in redacted output PDFs).
284
+
285
+ ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/review_row_highlight.PNG)
286
+
287
+ You can choose a specific entity type to see which pages the entity is present on. If you want to go to the page specified in the table, you can click on a cell in the table and the review page will be changed to that page.
288
+
289
+ To filter the 'Search suggested redactions' table you can:
290
+ 1. Click on one of the dropdowns (Redaction category, Page, Text), and select an option, or
291
+ 2. Write text in the 'Filter' box just above the table. Click the blue box to apply the filter to the table.
292
+
293
+ Once you have filtered the table, or selected a row from the table, you have a few options underneath on what you can do with the filtered rows:
294
+
295
+ - Click the **Exclude all redactions in table** button to remove all redactions visible in the table from the document. **Important:** ensure that you have clicked the blue tick icon next to the search box before doing this, or you will remove all redactions from the document. If you do end up doing this, click the 'Undo last element removal' button below to restore the redactions.
296
+ - Click the **Exclude specific redaction row** button to remove only the redaction from the last row you clicked on from the document. The currently selected row is visible below.
297
+ - Click the **Exclude all redactions with the same text as selected row** button to remove all redactions from the document that are exactly the same as the selected row text.
298
+
299
+ **NOTE**: After excluding redactions using any of the above options, click the 'Reset filters' button below to ensure that the dropdowns and table return to seeing all remaining redactions in the document.
300
+
301
+ If you make a mistake, click the 'Undo last element removal' button to restore the 'Search suggested redactions' table to its previous state (only the most recent action can be undone).
302
+
303
+ ### Navigating through the document using the 'Search all extracted text'
304
+
305
+ The 'search all extracted text' table will contain text if you have just redacted a document, or if you have uploaded a '..._ocr_output.csv' file alongside a document file and review file on the Review redactions tab as [described above](#uploading-documents-for-review).
306
+
307
+ You can navigate through the document using this table. When you click on a row, the Document viewer pane to the left will change to the selected page.
308
+
309
+ ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/select_extracted_text.PNG)
310
+
311
+ You can search through the extracted text by using the search bar just above the table, which should filter as you type. To apply the filter and 'cut' the table, click on the blue tick inside the box next to your search term. To return the table to its original content, click the 'Reset OCR output table filter' button below the table.
312
+
313
+ ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/search_extracted_text.PNG)
314
+
315
+ ## Redacting tabular data files (XLSX/CSV) or copy and pasted text
316
+
317
+ ### Tabular data files (XLSX/CSV)
318
+
319
+ The app can be used to redact tabular data files such as xlsx or csv files. For this to work properly, your data file needs to be in a simple table format, with a single table starting from the first cell (A1), and no other information in the sheet. Similarly for .xlsx files, each sheet in the file that you want to redact should be in this simple format.
320
+
321
+ To demonstrate this, we can use [the example csv file 'combined_case_notes.csv'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv), which is a small dataset of dummy social care case notes. Go to the 'Open text or Excel/csv files' tab. Drop the file into the upload area. After the file is loaded, you should see the suggested columns for redaction in the box underneath. You can select and deselect columns to redact as you wish from this list.
322
+
323
+ ![csv upload](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/tabular_files/file_upload_csv_columns.PNG)
324
+
325
+ If you were instead to upload an xlsx file, you would also see a list of all the sheets in the xlsx file that can be redacted. The 'Select columns' area underneath will suggest a list of all columns in the file across all sheets.
326
+
327
+ ![xlsx upload](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/tabular_files/file_upload_xlsx_columns.PNG)
328
+
329
+ Once you have chosen your input file and sheets/columns to redact, you can choose the redaction method. 'Local' will use the same local model as used for documents on the first tab. 'AWS Comprehend' will give better results, at a slight cost.
330
+
331
+ When you click Redact text/data files, you will see the progress of the redaction task by file and sheet, and you will receive a csv output with the redacted data.
332
+
333
+ ### Choosing output anonymisation format
334
+ You can also choose the anonymisation format of your output results. Open the tab 'Anonymisation output format' to see the options. By default, any detected PII will be replaced with the word 'REDACTED' in the cell. You can choose one of the following options as the form of replacement for the redacted text:
335
+ - replace with 'REDACTED': Replaced by the word 'REDACTED' (default)
336
+ - replace with <ENTITY_NAME>: Replaced by e.g. 'PERSON' for people, 'EMAIL_ADDRESS' for emails etc.
337
+ - redact completely: Text is removed completely and replaced by nothing.
338
+ - hash: Replaced by a unique long ID code that is consistent with entity text. I.e. a particular name will always have the same ID code.
339
+ - mask: Replace with stars '*'.
340
+
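+ To make these options concrete, here is a purely illustrative sketch of how a single cell might come out under each format. The exact replacement text is decided by the app, so treat the values below as indicative only:
+ 
+ ```python
+ # Hypothetical input cell: "Please call John Smith tomorrow."
+ illustrative_outputs = {
+     "replace with 'REDACTED'": "Please call REDACTED tomorrow.",
+     "replace with <ENTITY_NAME>": "Please call PERSON tomorrow.",
+     "redact completely": "Please call tomorrow.",
+     "hash": "Please call 3f2a9c... tomorrow.",  # the same name always maps to the same ID code
+     "mask": "Please call ********** tomorrow.",
+ }
+ ```
+ 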
341
+ ### Redacting copy and pasted text
342
+ You can also write open text into an input box and redact that using the same methods as described above. To do this, write or paste text into the 'Enter open text' box that appears when you open the 'Redact open text' tab. Then select a redaction method, and an anonymisation output format as described above. The redacted text will be printed in the output textbox, and will also be saved to a simple csv file in the output file box.
343
+
344
+ ![Text analysis output](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/tabular_files/text_anonymisation_outputs.PNG)
345
+
346
+ ### Redaction log outputs
347
+ A list of the suggested redaction outputs from the tabular data / open text data redaction is available on the Redaction settings page under 'Log file outputs'.
348
+
349
+ # ADVANCED USER GUIDE
350
+
351
+ This advanced user guide will go over some of the features recently added to the app, including: modifying and merging redaction review files, identifying and redacting duplicate pages across multiple PDFs, 'fuzzy' search and redact, and exporting redactions to Adobe Acrobat.
352
+
353
+ ## Table of contents
354
+
355
+ - [Merging redaction review files](#merging-redaction-review-files)
356
+ - [Identifying and redacting duplicate pages](#identifying-and-redacting-duplicate-pages)
357
+ - [Fuzzy search and redaction](#fuzzy-search-and-redaction)
358
+ - [Export redactions to and import from Adobe Acrobat](#export-to-and-import-from-adobe)
359
+ - [Exporting to Adobe Acrobat](#exporting-to-adobe-acrobat)
360
+ - [Importing from Adobe Acrobat](#importing-from-adobe-acrobat)
361
+ - [Using the AWS Textract document API](#using-the-aws-textract-document-api)
362
+ - [Using AWS Textract and Comprehend when not running in an AWS environment](#using-aws-textract-and-comprehend-when-not-running-in-an-aws-environment)
363
+ - [Modifying existing redaction review files](#modifying-existing-redaction-review-files)
364
+
365
+
366
+ ## Merging redaction review files
367
+
368
+ Say you have run multiple redaction tasks on the same document, and you want to merge all of these redactions together. You could do this in your spreadsheet editor, but this could be fiddly, especially if you are dealing with multiple review files or large numbers of redactions. The app has a feature to combine multiple review files together to create a 'merged' review file.
369
+
370
+ ![Merging review files in the user interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/merge_review_files/img/merge_review_files_interface.PNG)
371
+
372
+ You can find this option at the bottom of the 'Redaction Settings' tab. Upload multiple review files here to get a single output 'merged' review_file. In the example files, merging the 'review_file_custom.csv' and 'review_file_local.csv' files gives you an output containing redaction boxes from both. This combined review file can then be uploaded into the review tab following the usual procedure.
373
+
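+ The in-app merge is the recommended route, but as a rough illustration of what the merge amounts to, a pandas sketch (assuming the review files share the same columns, and using the example file names from this folder) could look like this:
+ 
+ ```python
+ import pandas as pd
+ 
+ # Combine two review files for the same document into one 'merged' review file,
+ # dropping any redaction boxes that appear in both.
+ custom = pd.read_csv("review_file_custom.csv")
+ local = pd.read_csv("review_file_local.csv")
+ 
+ merged = pd.concat([custom, local], ignore_index=True).drop_duplicates()
+ merged.to_csv("review_file_merged.csv", index=False)
+ ```
+ 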
374
+ ![Merging review files outputs in spreadsheet](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/merge_review_files/img/merged_review_file_outputs_csv.PNG)
375
+
376
+ ## Identifying and redacting duplicate pages
377
+
378
+ The files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/).
379
+
380
+ Some redaction tasks involve removing duplicate pages of text that may exist across multiple documents. This feature helps you find and remove duplicate content that may exist in single or multiple documents. It can identify everything from single identical pages to multi-page sections (subdocuments). The process involves three main steps: configuring the analysis, reviewing the results in the interactive interface, and then using the generated files to perform the redactions.
381
+
382
+ ![Example duplicate page inputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_input_interface_new.PNG)
383
+
384
+ **Step 1: Upload and Configure the Analysis**
385
+ First, navigate to the "Identify duplicate pages" tab. Upload all the ocr_output.csv files you wish to compare into the file area. These files are generated every time you run a redaction task and contain the text for each page of a document.
386
+
387
+ For our example, you can upload the four 'ocr_output.csv' files provided in the example folder into the file area. Click 'Identify duplicate pages' and you will see a number of files returned. In case you want to see the original PDFs, they are available [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/duplicate_page_find_in_app/input_pdfs/).
388
+
389
+ The default options will search for matching subdocuments of any length. Before running the analysis, you can configure these matching parameters to tell the tool what you're looking for:
390
+
391
+ ![Duplicate matching parameters](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_matching_parameters.PNG)
392
+
393
+ *Matching Parameters*
394
+ - **Similarity Threshold:** A score from 0 to 1. Pages or sequences of pages with a calculated text similarity above this value will be considered a match. The default of 0.9 (90%) is a good starting point for finding near-identical pages.
395
+ - **Min Word Count:** Pages with fewer words than this value will be completely ignored during the comparison. This is extremely useful for filtering out blank pages, title pages, or boilerplate pages that might otherwise create noise in the results. The default is 10.
396
+ - **Choosing a Matching Strategy:** You have three main options to find duplicate content.
397
+ - *'Subdocument' matching (default):* Use this to find the longest possible sequence of matching pages. The tool will find an initial match and then automatically expand it forward page-by-page until the consecutive match breaks. This is the best method for identifying complete copied chapters or sections of unknown length. It is enabled by default via the "Enable 'subdocument' matching" tick box, and overrides the two options described below.
398
+ - *Minimum length subdocument matching:* Use this to find sequences of consecutively matching pages with a minimum page length. For example, setting the slider to 3 will only return sections that are at least 3 pages long. How to enable: untick the "Enable 'subdocument' matching" box and set the "Minimum consecutive pages" slider to a value greater than 1.
399
+ - *Single Page Matching:* Use this to find all individual page pairs that are similar to each other. Leave the "Enable 'subdocument' matching" box unchecked and keep the "Minimum consecutive pages" slider at 1.
400
+
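+ The app runs this analysis for you, but if it helps to understand what the threshold and minimum word count control, below is a rough, simplified sketch of single-page matching using TF-IDF and cosine similarity. This is not the app's actual implementation, and the example page texts are made up:
+ 
+ ```python
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ 
+ SIMILARITY_THRESHOLD = 0.9   # pages scoring above this are treated as a match
+ MIN_WORD_COUNT = 10          # pages with fewer words than this are ignored
+ 
+ # One string per page, e.g. pulled from an '..._ocr_output.csv' file.
+ page_texts = [
+     "This page contains a long boilerplate section about partnership terms and conditions.",
+     "Short page",
+     "This page contains a long boilerplate section about partnership terms and conditions.",
+ ]
+ 
+ # Keep only pages with enough words to compare meaningfully, remembering their page numbers.
+ kept = [(i + 1, text) for i, text in enumerate(page_texts) if len(text.split()) >= MIN_WORD_COUNT]
+ page_numbers = [number for number, _ in kept]
+ documents = [text for _, text in kept]
+ 
+ # Score every kept page against every other kept page.
+ scores = cosine_similarity(TfidfVectorizer().fit_transform(documents))
+ 
+ for a in range(len(documents)):
+     for b in range(a + 1, len(documents)):
+         if scores[a, b] >= SIMILARITY_THRESHOLD:
+             print(f"Pages {page_numbers[a]} and {page_numbers[b]} look like duplicates "
+                   f"(score {scores[a, b]:.2f})")
+ ```
+ 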
401
+ Once your parameters are set, click the "Identify duplicate pages/subdocuments" button.
402
+
403
+ **Step 2: Review Results in the Interface**
404
+ After the analysis is complete, the results will be displayed directly in the interface.
405
+
406
+ *Analysis Summary:* A table will appear showing a summary of all the matches found. The columns will change depending on the matching strategy you chose. For subdocument matches, it will show the start and end pages of the matched sequence.
407
+
408
+ *Interactive Preview:* This is the most important part of the review process. Click on any row in the summary table. The full text of the matching page(s) will appear side-by-side in the "Full Text Preview" section below, allowing you to instantly verify the accuracy of the match.
409
+
410
+ ![Duplicate review interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_review_overview.PNG)
411
+
412
+ **Step 3: Download and Use the Output Files**
413
+ The analysis also generates a set of downloadable files for your records and for performing redactions.
414
+
415
+
416
+ - page_similarity_results.csv: This is a detailed report of the analysis you just ran. It shows a breakdown of the pages from each file that are most similar to each other above the similarity threshold. You can compare the text in the two columns 'Page_1_Text' and 'Page_2_Text'. For single-page matches, it will list each pair of matching pages. For subdocument matches, it will list the start and end pages of each matched sequence, along with the total length of the match.
417
+
418
+ ![Page similarity file example](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/page_similarity_example.PNG)
419
+
420
+ - [Original_Filename]_pages_to_redact.csv: For each input document that was found to contain duplicate content, a separate redaction list is created. This is a simple, one-column CSV file containing a list of all page numbers that should be removed. To use one of these files, you can upload the original document (i.e. the PDF) on the 'Review redactions' tab and then click on the 'Apply relevant duplicate page output to document currently under review' button; the whole pages suggested for redaction should then appear on the 'Review redactions' tab. Alternatively, you can reupload the file into the whole page redaction section as described in the ['Full page redaction list example' section](#full-page-redaction-list-example).
421
+
422
+ ![Example duplicate page redaction list](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/duplicate_page_find_in_app/img/duplicate_page_output_interface_new.PNG)
423
+
424
+ If you want to combine the results from this redaction process with previous redaction tasks for the same PDF, you could merge review file outputs following the steps described in [Merging redaction review files](#merging-redaction-review-files) above.
425
+
426
+ ## Fuzzy search and redaction
427
+
428
+ The files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/fuzzy_search/).
429
+
430
+ Sometimes you may be searching for terms that are slightly misspelled throughout a document, for example names. The document redaction app gives the option of searching for long phrases that may contain spelling mistakes, a method called 'fuzzy matching'.
431
+
432
+ To do this, go to the Redaction Settings and the 'Select entity types to redact' area. In the box relevant to your chosen redaction method (local or AWS Comprehend), select 'CUSTOM_FUZZY' from the list. Next, we can select the maximum number of spelling mistakes allowed in the search (up to nine). Here, you can either type in a number or use the small arrows to the right of the box. Change this option to 3. This will allow for a maximum of three 'changes' in the text needed to match the desired search terms.
433
+
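+ For context, the 'changes' counted here are essentially single-character edits (insertions, deletions, or substitutions). The app has its own matching logic, but as a minimal sketch of the idea, a standard Levenshtein edit distance check looks like this (the misspelled phrase below is a made-up example):
+ 
+ ```python
+ def levenshtein(a: str, b: str) -> int:
+     """Minimum number of single-character edits needed to turn a into b."""
+     previous = list(range(len(b) + 1))
+     for i, ca in enumerate(a, start=1):
+         current = [i]
+         for j, cb in enumerate(b, start=1):
+             current.append(min(
+                 previous[j] + 1,               # delete a character from a
+                 current[j - 1] + 1,            # insert a character into a
+                 previous[j - 1] + (ca != cb),  # substitute (free if the characters already match)
+             ))
+         previous = current
+     return previous[-1]
+ 
+ MAX_MISTAKES = 3  # the value set in the app above
+ 
+ # 'Freindship Cty' is three edits away from 'Friendship City', so it would still count as a match.
+ print(levenshtein("Freindship Cty", "Friendship City") <= MAX_MISTAKES)  # True
+ ```
+ 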
434
+ The other option ('Should fuzzy search match on entire phrases in deny list') we can leave as it is - this option controls whether the fuzzy search matches against the whole phrase, or against each individual word in the search phrase (apart from stop words).
435
+
436
+ Next, we can upload a deny list on the same page to do the fuzzy search. A relevant deny list file can be found [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/fuzzy_search/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv) - you can upload it following [these steps](#deny-list-example). You will notice that the suggested deny list has spelling mistakes compared to phrases found in the example document.
437
+
438
+ ![Deny list example with spelling mistakes](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/fuzzy_search/img/fuzzy_deny_list_example.PNG)
439
+
440
+ Upload the [Partnership-Agreement-Toolkit file](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/Partnership-Agreement-Toolkit_0_0.pdf) into the 'Redact document' area on the first tab. Now, press the 'Redact document' button.
441
+
442
+ Using this deny list with spelling mistakes, the app will fuzzy match these terms to the correct text in the document. After redaction is complete, go to the Review redactions tab to check the results. You should see that the phrases in the deny list have been successfully matched.
443
+
444
+ ![Fuzzy match review outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/fuzzy_search/img/fuzzy_search_review.PNG)
445
+
446
+ ## Export to and import from Adobe
447
+
448
+ Files for this section are stored [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/export_to_adobe/).
449
+
450
+ ### Exporting to Adobe Acrobat
451
+
452
+ The Document Redaction app has a feature to export suggested redactions to Adobe, and likewise to import Adobe comment files into the app. The file format used is the .xfdf Adobe comment file format - [you can find more information about how to use these files here](https://helpx.adobe.com/uk/acrobat/using/importing-exporting-comments.html).
453
+
454
+ To convert suggested redactions to Adobe format, you need to have the original PDF and a review file csv in the input box at the top of the Review redactions page.
455
+
456
+ ![Input area for files for Adobe export](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/adobe_export_input_area.PNG)
457
+
458
+ Then, you can find the export to Adobe option at the bottom of the Review redactions tab. Adobe comment files will be output here.
459
+
460
+ ![Adobe export/import options](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/export_to_adobe_interface.PNG)
461
+
462
+ Once the input files are ready, you can click on the 'Convert review file to Adobe comment format' button. You should see a file appear in the output box with a '.xfdf' file type. To use this in Adobe, after downloading it to your computer, you should be able to double click on it, and a pop-up box will appear asking you to find the PDF file associated with it. Find the original PDF file used for your redaction task. The file should then open in Adobe Acrobat with the suggested redactions.
463
+
464
+ ![Suggested redactions in Adobe Acrobat](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/adobe_redact_example.PNG)
465
+
466
+ ### Importing from Adobe Acrobat
467
+
468
+ The app also allows you to import .xfdf files from Adobe Acrobat. To do this, go to the same Adobe import/export area as described above at the bottom of the Review Redactions tab. In this box, you need to upload a .xfdf Adobe comment file, along with the relevant original PDF for redaction.
469
+
470
+ ![Adobe import interface](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/import_from_adobe_interface.PNG)
471
+
472
+ When you click the 'convert .xfdf comment file to review_file.csv' button, the app should take you up to the top of the screen where the new review file has been created and can be downloaded.
473
+
474
+ ![Outputs from Adobe import](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/export_to_adobe/img/import_from_adobe_interface_outputs.PNG)
475
+
476
+ ## Using the AWS Textract document API
477
+
478
+ This option can be enabled by your system admin in the config file (the 'SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS' environment variable, and subsequent variables). Using this, you will have the option to submit whole documents in quick succession to the AWS Textract service to get extracted text outputs quickly (faster than using the standard 'Redact document' process described earlier).
479
+
480
+ ### Starting a new Textract API job
481
+
482
+ To use this feature, first upload a document file in the file input box [in the usual way](#upload-files-to-the-app) on the first tab of the app. Under AWS Textract signature detection you can select whether or not you would like to analyse signatures (with a [cost implication](#optional---select-signature-extraction)).
483
+
484
+ Then, open the section under the heading 'Submit whole document to AWS Textract API...'.
485
+
486
+ ![Textract document API menu](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/quick_start/textract_document_api.PNG)
487
+
488
+ Click 'Analyse document with AWS Textract API call'. After a few seconds, the job should be submitted to the AWS Textract service. The box 'Job ID to check status' should now have an ID filled in, and the table should gain a row with details of the new API job (alongside any previous jobs from up to seven days old).
489
+
490
+ Click the button underneath, 'Check status of Textract job and download', to see progress on the job. Processing will continue in the background until the job is ready, so it is worth periodically clicking this button to see if the outputs are ready. In testing, and as a rough estimate, it seems like this process takes about five seconds per page. However, this has not been tested with very large documents. Once ready, the '_textract.json' output should appear below.
491
+
492
+ ### Textract API job outputs
493
+
494
+ The '_textract.json' output can be used to speed up further redaction tasks as [described previously](#optional---costs-and-time-estimation) - when it is uploaded alongside the document, the 'Existing Textract output file found' flag should be ticked.
495
+
496
+ ![Textract document API initial outputs](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/textract_api/textract_api_initial_outputs.PNG)
497
+
498
+ You can now easily get the '..._ocr_output.csv' redaction output based on this '_textract.json' (described in [Redaction outputs](#redaction-outputs)) by clicking on the 'Convert Textract job outputs to OCR results' button. You can then use this file e.g. for [identifying duplicate pages](#identifying-and-redacting-duplicate-pages), or for redaction review.
499
+
500
+ ## Using AWS Textract and Comprehend when not running in an AWS environment
501
+
502
+ AWS Textract and Comprehend give much better results for text extraction and document redaction than the local model options in the app. The most secure way to access them in the Redaction app is to run the app in a secure AWS environment with relevant permissions. Alternatively, you could run the app on your own system while logged in to AWS SSO with relevant permissions.
503
+
504
+ However, it is possible to access these services directly via API from outside an AWS environment by creating IAM users and access keys with relevant permissions to access AWS Textract and Comprehend services. Please check with your IT and data security teams that this approach is acceptable for your data before trying the following approaches.
505
+
506
+ To do this, in your AWS environment you will need to create a new user with permissions for "textract:AnalyzeDocument", "textract:DetectDocumentText", and "comprehend:DetectPiiEntities". Under security credentials, create new access keys - note down the access key and secret key.
507
+
508
+ ### Direct access by passing AWS access keys through app
509
+ The Redaction Settings tab now has boxes for entering the AWS access key and secret key. If you paste the relevant keys into these boxes before performing redaction, you should be able to use these services in the app.
510
+
511
+ ### Picking up AWS access keys through an .env file
512
+ The app can also pick up AWS access key details through a .env file located at 'config/aws_config.env' (the default), or at an alternative .env file location specified by the environment variable AWS_CONFIG_PATH. The env file should contain just two lines, like the following:
513
+
514
+ AWS_ACCESS_KEY= your-access-key
515
+ AWS_SECRET_KEY= your-secret-key
516
+
517
+ The app should then pick up these keys when trying to access the AWS Textract and Comprehend services during redaction.
518
+
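+ The app reads these values itself during redaction, but purely as an illustration of how such keys end up being used, a sketch with python-dotenv and boto3 might look like the following (the region name is an assumption - use whichever region your services run in):
+ 
+ ```python
+ import os
+ 
+ import boto3
+ from dotenv import load_dotenv
+ 
+ # Load AWS_ACCESS_KEY / AWS_SECRET_KEY from the config file described above.
+ load_dotenv("config/aws_config.env")
+ 
+ credentials = {
+     "aws_access_key_id": os.environ["AWS_ACCESS_KEY"],
+     "aws_secret_access_key": os.environ["AWS_SECRET_KEY"],
+     "region_name": "eu-west-2",  # assumption - set this to your own region
+ }
+ 
+ textract = boto3.client("textract", **credentials)
+ comprehend = boto3.client("comprehend", **credentials)
+ ```
+ 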
519
+ Again, a lot can potentially go wrong with AWS solutions that are insecure, so before trying the above please consult with your AWS and data security teams.
520
+
521
+ ## Modifying existing redaction review files
522
+
523
+ *Note:* As of version 0.7.0 you can now modify redaction review files directly in the app on the 'Review redactions' tab. Open the accordion 'View and edit review data' under the file input area. You can edit review file data cells here - press Enter to apply changes. You should see the effect on the current page if you click the 'Save changes on current page to file' button to the right.
524
+
525
+ You can find the folder containing the files discussed in this section [here](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/).
526
+
527
+ As well as serving as inputs to the document redaction app's review function, the 'review_file.csv' output can be modified outside of the app, and also merged with others from multiple redaction attempts on the same file. This gives you the flexibility to change redaction details without using the app.
528
+
529
+ If you open up a 'review_file' csv output using spreadsheet software such as Microsoft Excel, you can easily modify redaction properties. Open the file '[Partnership-Agreement-Toolkit_0_0_redacted.pdf_review_file_local.csv](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/Partnership-Agreement-Toolkit_0_0.pdf_review_file_local.csv)', and you should see a spreadsheet with just four suggested redactions (see below). The following instructions are for using Excel.
530
+
531
+ ![Review file before](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/merge_review_files/img/review_file_before.PNG)
532
+
533
+ The first thing we can do is remove the first row - 'et' is suggested as a person, but is obviously not a genuine instance of personal information. Right click on the row number and select 'Delete' from the menu. Next, let's imagine that what the app identified as a 'phone number' was in fact another type of number, and so we wanted to change the label. Simply click on the relevant label cell and change it, let's say to 'SECURITY_NUMBER'. You could also use 'Find & Select' -> 'Replace' from the top ribbon menu if you wanted to change a number of labels simultaneously.
534
+
535
+ What if we wanted to change the colour of the 'email address' entry on the redaction review tab of the redaction app? The colours in a review file are based on an RGB scale with three numbers ranging from 0-255. [You can find suitable colours here](https://rgbcolorpicker.com). Using this scale, if I wanted my review box to be pure blue, I could change the cell value to (0,0,255).
536
+
537
+ Imagine that a redaction box was slightly too small, and I didn't want to use the in-app options to change the size. In the review file csv, we can modify e.g. the ymin and ymax values for any box to increase the extent of the redaction box. For the 'email address' entry, let's decrease ymin by 5, and increase ymax by 5.
538
+
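+ If you would rather script these edits than make them by hand, a rough pandas sketch of the same three changes is below. The column names ('label', 'color', 'ymin', 'ymax') and the entity label values are based on the description above - check them against your own review file before relying on anything like this:
+ 
+ ```python
+ import pandas as pd
+ 
+ review = pd.read_csv("Partnership-Agreement-Toolkit_0_0.pdf_review_file_local.csv")
+ 
+ # 1. Drop the spurious 'et' person redaction (the first data row).
+ review = review.iloc[1:]
+ 
+ # 2. Relabel the phone number entry (the label values here are assumptions).
+ review.loc[review["label"] == "PHONE_NUMBER", "label"] = "SECURITY_NUMBER"
+ 
+ # 3. Make the email address box pure blue and slightly taller.
+ is_email = review["label"] == "EMAIL_ADDRESS"
+ review.loc[is_email, "color"] = "(0,0,255)"
+ review.loc[is_email, "ymin"] = review.loc[is_email, "ymin"] - 5
+ review.loc[is_email, "ymax"] = review.loc[is_email, "ymax"] + 5
+ 
+ review.to_csv("Partnership-Agreement-Toolkit_0_0.pdf_review_file_local_mod.csv", index=False)
+ ```
+ 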
539
+ I have saved an output file following the above steps as '[Partnership-Agreement-Toolkit_0_0_redacted.pdf_review_file_local_mod.csv](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/merge_review_files/outputs/Partnership-Agreement-Toolkit_0_0.pdf_review_file_local_mod.csv)' in the same folder as the original. Let's upload this file to the app along with the original pdf to see how the redactions look now.
540
+
541
+ ![Review file after modification](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/merge_review_files/img/partnership_redactions_after.PNG)
542
+
543
+ We can see from the above that we have successfully removed a redaction box, changed labels, colours, and redaction box sizes.
tld/.tld_set_snapshot DELETED
The diff for this file is too large to render. See raw diff
 
tools/aws_functions.py CHANGED
@@ -228,5 +228,6 @@ def upload_log_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=
228
  print(final_out_message_str)
229
  else:
230
  final_out_message_str = "App not set to run AWS functions"
 
231
 
232
  return final_out_message_str
 
228
  print(final_out_message_str)
229
  else:
230
  final_out_message_str = "App not set to run AWS functions"
231
+ print(final_out_message_str)
232
 
233
  return final_out_message_str
tools/config.py CHANGED
@@ -64,10 +64,12 @@ def add_folder_to_path(folder_path: str):
64
  # LOAD CONFIG FROM ENV FILE
65
  ###
66
 
67
- ensure_folder_exists("config/")
 
 
68
 
69
  # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
70
- APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', 'config/app_config.env') # e.g. config/app_config.env
71
 
72
  if APP_CONFIG_PATH:
73
  if os.path.exists(APP_CONFIG_PATH):
@@ -75,10 +77,6 @@ if APP_CONFIG_PATH:
75
  load_dotenv(APP_CONFIG_PATH)
76
  else: print("App config file not found at location:", APP_CONFIG_PATH)
77
 
78
-
79
-
80
-
81
-
82
  ###
83
  # AWS OPTIONS
84
  ###
@@ -149,6 +147,12 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
149
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
150
 
151
 
 
 
 
 
 
 
152
  ###
153
  # LOGGING OPTIONS
154
  ###
@@ -182,7 +186,7 @@ DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS',
182
 
183
  CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
184
  CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
185
- CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]') # If blank, uses component labels
186
 
187
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
188
 
@@ -310,7 +314,7 @@ COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") #
310
  # APP RUN OPTIONS
311
  ###
312
 
313
- TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tld/.tld_set_snapshot')
314
  try:
315
  extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
316
  except:
 
64
  # LOAD CONFIG FROM ENV FILE
65
  ###
66
 
67
+ CONFIG_FOLDER = get_or_create_env_var('CONFIG_FOLDER', 'config/')
68
+
69
+ ensure_folder_exists(CONFIG_FOLDER)
70
 
71
  # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
72
+ APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', CONFIG_FOLDER + 'app_config.env') # e.g. config/app_config.env
73
 
74
  if APP_CONFIG_PATH:
75
  if os.path.exists(APP_CONFIG_PATH):
 
77
  load_dotenv(APP_CONFIG_PATH)
78
  else: print("App config file not found at location:", APP_CONFIG_PATH)
79
 
 
 
 
 
80
  ###
81
  # AWS OPTIONS
82
  ###
 
147
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
148
 
149
 
150
+ GRADIO_TEMP_DIR = get_or_create_env_var('GRADIO_TEMP_DIR', 'tmp/gradio_tmp/') # Default Gradio temp folder
151
+ MPLCONFIGDIR = get_or_create_env_var('MPLCONFIGDIR', 'tmp/matplotlib_cache/') # Matplotlib cache folder
152
+
153
+ ensure_folder_exists(GRADIO_TEMP_DIR)
154
+ ensure_folder_exists(MPLCONFIGDIR)
155
+
156
  ###
157
  # LOGGING OPTIONS
158
  ###
 
186
 
187
  CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
188
  CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
189
+ CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]') # If blank, uses component labels
190
 
191
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
192
 
 
314
  # APP RUN OPTIONS
315
  ###
316
 
317
+ TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tmp/tld/')
318
  try:
319
  extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
320
  except:
tools/file_conversion.py CHANGED
@@ -385,28 +385,32 @@ def convert_pymupdf_to_image_coords(pymupdf_page:Page, x1:float, y1:float, x2:fl
385
 
386
  return x1_image, y1_image, x2_image, y2_image
387
 
388
- def redact_whole_pymupdf_page(rect_height:float, rect_width:float, image:Image, page:Page, custom_colours, border:float = 5, image_dimensions:dict={}):
389
  # Small border to page that remains white
390
- border = 5
391
  # Define the coordinates for the Rect
392
  whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
393
- whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
394
 
395
- # whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image, image_dimensions=image_dimensions)
 
 
 
 
396
 
397
  # Create new image annotation element based on whole page coordinates
398
  whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
399
 
400
  # Write whole page annotation to annotation boxes
401
  whole_page_img_annotation_box = {}
402
- whole_page_img_annotation_box["xmin"] = whole_page_x1 #whole_page_image_x1
403
- whole_page_img_annotation_box["ymin"] = whole_page_y1 #whole_page_image_y1
404
- whole_page_img_annotation_box["xmax"] = whole_page_x2 #whole_page_image_x2
405
- whole_page_img_annotation_box["ymax"] = whole_page_y2 #whole_page_image_y2
406
  whole_page_img_annotation_box["color"] = (0,0,0)
407
  whole_page_img_annotation_box["label"] = "Whole page"
408
 
409
- redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
 
410
 
411
  return whole_page_img_annotation_box
412
 
@@ -1292,7 +1296,13 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
1292
  df = pd.DataFrame({
1293
  "image": [anno.get("image") for anno in all_annotations],
1294
  # Ensure 'boxes' defaults to an empty list if missing or None
1295
- "boxes": [anno.get("boxes") if isinstance(anno.get("boxes"), list) else [] for anno in all_annotations]
 
 
 
 
 
 
1296
  })
1297
 
1298
  # 2. Calculate the page number using the helper function
 
385
 
386
  return x1_image, y1_image, x2_image, y2_image
387
 
388
+ def redact_whole_pymupdf_page(rect_height:float, rect_width:float, page:Page, custom_colours:bool=False, border:float = 5, redact_pdf:bool=True):
389
  # Small border to page that remains white
390
+
391
  # Define the coordinates for the Rect
392
  whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
 
393
 
394
+ # If border is a tiny value, assume that we want relative values
395
+ if border < 0.1:
396
+ whole_page_x2, whole_page_y2 = 1 - border, 1 - border # Top-right corner
397
+ else:
398
+ whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
399
 
400
  # Create new image annotation element based on whole page coordinates
401
  whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
402
 
403
  # Write whole page annotation to annotation boxes
404
  whole_page_img_annotation_box = {}
405
+ whole_page_img_annotation_box["xmin"] = whole_page_x1
406
+ whole_page_img_annotation_box["ymin"] = whole_page_y1
407
+ whole_page_img_annotation_box["xmax"] = whole_page_x2
408
+ whole_page_img_annotation_box["ymax"] = whole_page_y2
409
  whole_page_img_annotation_box["color"] = (0,0,0)
410
  whole_page_img_annotation_box["label"] = "Whole page"
411
 
412
+ if redact_pdf == True:
413
+ redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
414
 
415
  return whole_page_img_annotation_box
416
 
 
1296
  df = pd.DataFrame({
1297
  "image": [anno.get("image") for anno in all_annotations],
1298
  # Ensure 'boxes' defaults to an empty list if missing or None
1299
+ "boxes": [
1300
+ anno.get("boxes") if isinstance(anno.get("boxes"), list)
1301
+ else [anno.get("boxes")] if isinstance(anno.get("boxes"), dict)
1302
+ else []
1303
+ for anno in all_annotations
1304
+ ]
1305
+
1306
  })
1307
 
1308
  # 2. Calculate the page number using the helper function
tools/file_redaction.py CHANGED
@@ -1114,7 +1114,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
1114
  # If whole page is to be redacted, do that here
1115
  if redact_whole_page == True:
1116
 
1117
- whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5, image_dimensions=image_dimensions)
1118
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
1119
 
1120
  out_annotation_boxes = {
@@ -1372,10 +1372,19 @@ def redact_image_pdf(file_path:str,
1372
  if current_loop_page == 0: page_loop_start = 0
1373
  else: page_loop_start = current_loop_page
1374
 
1375
- progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
1376
 
1377
- all_line_level_ocr_results_df_list = [all_line_level_ocr_results_df]
1378
- all_pages_decision_process_table_list = [all_pages_decision_process_table]
 
 
 
 
 
 
 
 
 
1379
 
1380
  # Go through each page
1381
  for page_no in progress_bar:
@@ -1525,7 +1534,10 @@ def redact_image_pdf(file_path:str,
1525
  'height': result.height
1526
  } for result in page_line_level_ocr_results['results']])
1527
 
1528
- all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
 
 
 
1529
 
1530
  if pii_identification_method != NO_REDACTION_PII_OPTION:
1531
  # Step 2: Analyse text and identify PII
@@ -1637,7 +1649,10 @@ def redact_image_pdf(file_path:str,
1637
  'page': reported_page_number
1638
  } for result in page_merged_redaction_bboxes])
1639
 
1640
- all_pages_decision_process_table_list.append(decision_process_table)
 
 
 
1641
 
1642
  decision_process_table = fill_missing_ids(decision_process_table)
1643
  decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
@@ -1685,8 +1700,11 @@ def redact_image_pdf(file_path:str,
1685
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1686
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1687
 
1688
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1689
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
 
 
 
1690
 
1691
 
1692
  current_loop_page += 1
@@ -1733,9 +1751,11 @@ def redact_image_pdf(file_path:str,
1733
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1734
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1735
 
 
 
1736
 
1737
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1738
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
1739
 
1740
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1741
 
@@ -1758,8 +1778,8 @@ def redact_image_pdf(file_path:str,
1758
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1759
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1760
 
1761
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1762
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
1763
 
1764
  # Convert decision table and ocr results to relative coordinates
1765
  all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
@@ -2002,11 +2022,11 @@ def redact_text_pdf(
2002
  tic = time.perf_counter()
2003
 
2004
  if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
2005
- all_line_level_ocr_results_df_list = [all_line_level_ocr_results_df]
2006
 
2007
  if isinstance(all_pages_decision_process_table, pd.DataFrame):
2008
  # Convert decision outputs to list of dataframes:
2009
- all_pages_decision_process_table_list = [all_pages_decision_process_table]
2010
 
2011
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
2012
  out_message = "Connection to AWS Comprehend service not found."
@@ -2133,7 +2153,7 @@ def redact_text_pdf(
2133
  page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
2134
 
2135
  if not page_decision_process_table.empty:
2136
- all_pages_decision_process_table_list.append(page_decision_process_table)
2137
 
2138
  # Else, user chose not to run redaction
2139
  else:
@@ -2145,7 +2165,7 @@ def redact_text_pdf(
2145
  if not page_text_ocr_outputs.empty:
2146
  page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
2147
  page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
2148
- all_line_level_ocr_results_df_list.append(page_text_ocr_outputs)
2149
 
2150
  toc = time.perf_counter()
2151
 
@@ -2168,8 +2188,8 @@ def redact_text_pdf(
2168
  annotations_all_pages.append(page_image_annotations)
2169
 
2170
  # Write logs
2171
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2172
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
2173
 
2174
 
2175
  current_loop_page += 1
@@ -2193,16 +2213,16 @@ def redact_text_pdf(
2193
  progress.close(_tqdm=progress_bar)
2194
 
2195
  # Write logs
2196
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2197
 
2198
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
2199
 
2200
  # Write all page outputs
2201
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2202
 
2203
- #print("all_line_level_ocr_results_df_list:", all_line_level_ocr_results_df_list)
2204
 
2205
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
2206
 
2207
  #print("all_line_level_ocr_results_df after concat:", all_line_level_ocr_results_df)
2208
 
 
1114
  # If whole page is to be redacted, do that here
1115
  if redact_whole_page == True:
1116
 
1117
+ whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, page, custom_colours, border = 5)
1118
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
1119
 
1120
  out_annotation_boxes = {
 
1372
  if current_loop_page == 0: page_loop_start = 0
1373
  else: page_loop_start = current_loop_page
1374
 
1375
+ progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
1376
 
1377
+ # If there's data from a previous run (passed in via the DataFrame parameters), add it
1378
+ all_line_level_ocr_results_list = []
1379
+ all_pages_decision_process_list = []
1380
+
1381
+ if not all_line_level_ocr_results_df.empty:
1382
+ all_line_level_ocr_results_list.extend(all_line_level_ocr_results_df.to_dict('records'))
1383
+ if not all_pages_decision_process_table.empty:
1384
+ all_pages_decision_process_list.extend(all_pages_decision_process_table.to_dict('records'))
1385
+
1386
+ #all_line_level_ocr_results_list = [all_line_level_ocr_results_df.to_dict('records')]#[all_line_level_ocr_results_df]
1387
+ #all_pages_decision_process_list = [all_pages_decision_process_table.to_dict('records')]#[all_pages_decision_process_table]
1388
 
1389
  # Go through each page
1390
  for page_no in progress_bar:
 
1534
  'height': result.height
1535
  } for result in page_line_level_ocr_results['results']])
1536
 
1537
+ #all_line_level_ocr_results_list.append(line_level_ocr_results_df.to_dict('records'))
1538
+
1539
+ if not line_level_ocr_results_df.empty: # Ensure there are records to add
1540
+ all_line_level_ocr_results_list.extend(line_level_ocr_results_df.to_dict('records'))
1541
 
1542
  if pii_identification_method != NO_REDACTION_PII_OPTION:
1543
  # Step 2: Analyse text and identify PII
 
1649
  'page': reported_page_number
1650
  } for result in page_merged_redaction_bboxes])
1651
 
1652
+ #all_pages_decision_process_list.append(decision_process_table.to_dict('records'))
1653
+
1654
+ if not decision_process_table.empty: # Ensure there are records to add
1655
+ all_pages_decision_process_list.extend(decision_process_table.to_dict('records'))
1656
 
1657
  decision_process_table = fill_missing_ids(decision_process_table)
1658
  decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
 
1700
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1701
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1702
 
1703
+ #all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
1704
+ #all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
1705
+
1706
+ all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
1707
+ all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
1708
 
1709
 
1710
  current_loop_page += 1
 
1751
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1752
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1753
 
1754
+ #all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
1755
+ #all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
1756
 
1757
+ all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
1758
+ all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
1759
 
1760
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1761
 
 
1778
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1779
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1780
 
1781
+ all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list) #pd.concat(all_pages_decision_process_list)
1782
+ all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list) #pd.concat(all_line_level_ocr_results_list)
1783
 
1784
  # Convert decision table and ocr results to relative coordinates
1785
  all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
 
2022
  tic = time.perf_counter()
2023
 
2024
  if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
2025
+ all_line_level_ocr_results_list = [all_line_level_ocr_results_df]
2026
 
2027
  if isinstance(all_pages_decision_process_table, pd.DataFrame):
2028
  # Convert decision outputs to list of dataframes:
2029
+ all_pages_decision_process_list = [all_pages_decision_process_table]
2030
 
2031
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
2032
  out_message = "Connection to AWS Comprehend service not found."
 
2153
  page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
2154
 
2155
  if not page_decision_process_table.empty:
2156
+ all_pages_decision_process_list.append(page_decision_process_table)
2157
 
2158
  # Else, user chose not to run redaction
2159
  else:
 
2165
  if not page_text_ocr_outputs.empty:
2166
  page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
2167
  page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
2168
+ all_line_level_ocr_results_list.append(page_text_ocr_outputs)
2169
 
2170
  toc = time.perf_counter()
2171
 
 
2188
  annotations_all_pages.append(page_image_annotations)
2189
 
2190
  # Write logs
2191
+ all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2192
+ all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
2193
 
2194
 
2195
  current_loop_page += 1
 
2213
  progress.close(_tqdm=progress_bar)
2214
 
2215
  # Write logs
2216
+ all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2217
 
2218
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
2219
 
2220
  # Write all page outputs
2221
+ all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2222
 
2223
+ #print("all_line_level_ocr_results_list:", all_line_level_ocr_results_list)
2224
 
2225
+ all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
2226
 
2227
  #print("all_line_level_ocr_results_df after concat:", all_line_level_ocr_results_df)
2228
 
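The pattern adopted in redact_image_pdf above is to accumulate per-page rows as plain dict records and build a single DataFrame once at the end, instead of concatenating a growing list of per-page DataFrames. A minimal sketch of the same idea (the page data is hypothetical):

import pandas as pd

all_line_level_ocr_results_list = []

for page_no in range(1, 4):
    # Stand-in for the per-page OCR results DataFrame built inside the redaction loop
    line_level_ocr_results_df = pd.DataFrame(
        [{"page": page_no, "text": f"example line on page {page_no}"}]
    )
    if not line_level_ocr_results_df.empty:  # only extend when there are records to add
        all_line_level_ocr_results_list.extend(line_level_ocr_results_df.to_dict("records"))

# One DataFrame constructed once, after the loop
all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
print(all_line_level_ocr_results_df)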
tools/find_duplicate_pages.py CHANGED
@@ -1,32 +1,19 @@
1
  import pandas as pd
2
- #import argparse
3
- #import glob
4
  import os
5
  import re
6
  from tools.helper_functions import OUTPUT_FOLDER
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
- # import nltk
10
- # from nltk.corpus import stopwords
11
- # from nltk.tokenize import word_tokenize
12
- # from nltk.stem import PorterStemmer
13
- #import spacy
14
- import numpy as np
15
- import random
16
- import string
17
- from typing import List
18
  from gradio import Progress
19
-
20
- import en_core_web_lg #en_core_web_sm
 
 
21
  nlp = en_core_web_lg.load()
22
- #from tqdm import tqdm
23
-
24
- # nltk.download('punkt')
25
- # nltk.download('stopwords')
26
- # nltk.download('punkt_tab')
27
-
28
- similarity_threshold = 0.9
29
 
 
30
 
31
  def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
32
  """
@@ -96,31 +83,11 @@ def process_data(df:pd.DataFrame, column:str):
96
  def _clean_text(raw_text):
97
  # Remove HTML tags
98
  clean = re.sub(r'<.*?>', '', raw_text)
99
- # clean = re.sub(r'&nbsp;', ' ', clean)
100
- # clean = re.sub(r'\r\n', ' ', clean)
101
- # clean = re.sub(r'&lt;', ' ', clean)
102
- # clean = re.sub(r'&gt;', ' ', clean)
103
- # clean = re.sub(r'<strong>', ' ', clean)
104
- # clean = re.sub(r'</strong>', ' ', clean)
105
-
106
- # Replace non-breaking space \xa0 with a space
107
- # clean = clean.replace(u'\xa0', u' ')
108
- # Remove extra whitespace
109
  clean = ' '.join(clean.split())
110
-
111
- # # Tokenize the text
112
- # words = word_tokenize(clean.lower())
113
-
114
- # # Remove punctuation and numbers
115
- # words = [word for word in words if word.isalpha()]
116
-
117
- # # Remove stopwords
118
- # words = [word for word in words if word not in stop_words]
119
-
120
  # Join the cleaned words back into a string
121
  return clean
122
 
123
- # Function to apply lemmatization and remove stopwords
124
  def _apply_lemmatization(text):
125
  doc = nlp(text)
126
  # Keep only alphabetic tokens and remove stopwords
@@ -133,119 +100,497 @@ def process_data(df:pd.DataFrame, column:str):
133
 
134
  return df
135
 
136
- def identify_similar_pages(input_files: List[str], similarity_threshold: float = 0.9, output_folder:str=OUTPUT_FOLDER, progress=Progress(track_tqdm=True)):
137
- output_paths = []
138
 
139
- progress(0.1, desc="Cleaning input text")
140
 
141
- # Load and clean data
142
- df, output_files = combine_ocr_output_text(input_files)
143
- output_paths.extend(output_files)
144
- df = process_data(df, 'text') # Assume this returns 'text_clean', 'file', and 'page' columns
145
 
146
- # Vectorize text
 
 
147
  vectorizer = TfidfVectorizer()
148
- tfidf_matrix = vectorizer.fit_transform(df['text_clean'])
149
 
150
  progress(0.3, desc="Calculating text similarity")
151
 
152
- # Compute sparse cosine similarity
153
- similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False) # Keep sparse format
154
 
155
- # Extract indices of similar pages above threshold
156
- coo_matrix = similarity_matrix.tocoo()
157
- similar_pages = np.array([(i, j, v) for i, j, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data) if v > similarity_threshold])
158
 
159
- if similar_pages.size == 0:
160
- return pd.DataFrame(), output_paths # Return empty if no matches
161
 
 
162
 
 
163
 
164
- # Create a DataFrame for similar pairs
165
- similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
166
-
167
- # Remove duplicate pairs (keep one direction)
168
- similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]
169
 
170
- progress(0.8, desc="Mapping back results")
171
- # Map indices to metadata
172
- # index_map = df[['file', 'page', 'text']].to_dict(orient='index')
173
- # similarity_df['Page1_File'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['file'])
174
- # similarity_df['Page2_File'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['file'])
175
- # similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['page'])
176
- # similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['page'])
177
- # similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['text'][0:200])
178
- # similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['text'][0:200])
179
 
180
- # Create a DataFrame with the metadata
181
- metadata_df = df[['file', 'page', 'text']].reset_index()
182
 
183
- # Merge to get the metadata for Page1
184
- similarity_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_on='index', suffixes=('', '_Page1'))
185
- similarity_df = similarity_df.rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})
186
 
187
- # Merge to get the metadata for Page2
188
- similarity_df = similarity_df.merge(metadata_df, left_on='Page2_Index', right_on='index', suffixes=('', '_Page2'))
189
- similarity_df = similarity_df.rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})
190
 
191
- # Optionally, drop the index columns if not needed
192
- #similarity_df = similarity_df.drop(columns=['index_Page1', 'index_Page2'])
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
- similarity_df["Similarity_Score"] = similarity_df["Similarity_Score"].round(3)
196
 
197
- # Sort results
198
- similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
199
- similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])
200
 
201
- similarity_df_out['Page1_Text'] = similarity_df_out['Page1_Text'][0:100]
202
- similarity_df_out['Page2_Text'] = similarity_df_out['Page2_Text'][0:100]
 
 
 
 
 
203
 
204
- progress(0.8, desc="Saving output files")
205
 
206
- # Save results
207
- similarity_file_output_path = output_folder + 'page_similarity_results.csv'
208
- similarity_df_out.to_csv(similarity_file_output_path, index=False)
209
- output_paths.append(similarity_file_output_path)
210
 
211
- # Save per-file redaction lists
212
- for redact_file in similarity_df_out['Page2_File'].unique():
213
- output_file_name = output_folder + redact_file + "_whole_page.csv"
214
- whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File'] == redact_file, ['Page2_Page']].drop_duplicates(['Page2_Page']).sort_values('Page2_Page')
215
- whole_pages_to_redact_df.to_csv(output_file_name, header=False, index=False)
216
- output_paths.append(output_file_name)
 
 
 
 
217
 
218
- return similarity_df_out, output_paths
219
 
220
- # Perturb text
221
- # Apply the perturbation function with a 10% error probability
222
- def perturb_text_with_errors(series:pd.Series):
 
223
 
224
- def _perturb_text(text, error_probability=0.1):
225
- words = text.split() # Split text into words
226
- perturbed_words = []
227
 
228
- for word in words:
229
- if random.random() < error_probability: # Add a random error
230
- perturbation_type = random.choice(['char_error', 'extra_space', 'extra_punctuation'])
231
-
232
- if perturbation_type == 'char_error': # Introduce a character error
233
- idx = random.randint(0, len(word) - 1)
234
- char = random.choice(string.ascii_lowercase) # Add a random letter
235
- word = word[:idx] + char + word[idx:]
236
-
237
- elif perturbation_type == 'extra_space': # Add extra space around a word
238
- word = ' ' + word + ' '
239
 
240
- elif perturbation_type == 'extra_punctuation': # Add punctuation to the word
241
- punctuation = random.choice(string.punctuation)
242
- idx = random.randint(0, len(word)) # Insert punctuation randomly
243
- word = word[:idx] + punctuation + word[idx:]
244
 
245
- perturbed_words.append(word)
246
-
247
- return ' '.join(perturbed_words)
248
 
249
- series = series.apply(lambda x: _perturb_text(x, error_probability=0.1))
250
 
251
- return series
 
1
  import pandas as pd
 
 
2
  import os
3
  import re
4
  from tools.helper_functions import OUTPUT_FOLDER
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
+ from typing import List, Tuple
8
+ import gradio as gr
 
 
 
 
 
 
 
9
  from gradio import Progress
10
+ from pathlib import Path
11
+ from pymupdf import Document
12
+ from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe
13
+ import en_core_web_lg
14
  nlp = en_core_web_lg.load()
15
 
16
+ similarity_threshold = 0.95
17
 
18
  def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
19
  """
 
83
  def _clean_text(raw_text):
84
  # Remove HTML tags
85
  clean = re.sub(r'<.*?>', '', raw_text)
86
  clean = ' '.join(clean.split())
87
  # Join the cleaned words back into a string
88
  return clean
89
 
90
+ # Function to apply lemmatisation and remove stopwords
91
  def _apply_lemmatization(text):
92
  doc = nlp(text)
93
  # Keep only alphabetic tokens and remove stopwords
 
100
 
101
  return df
102
 
103
+ def map_metadata_single_page(similarity_df:pd.DataFrame, metadata_source_df:pd.DataFrame, preview_length:int=200):
104
+ """Helper to map metadata for single page results."""
105
+ metadata_df = metadata_source_df[['file', 'page', 'text']]
106
+ results_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_index=True)\
107
+ .rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})
108
+ results_df = results_df.merge(metadata_df, left_on='Page2_Index', right_index=True, suffixes=('_1', '_2'))\
109
+ .rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})
110
+ results_df["Similarity_Score"] = results_df["Similarity_Score"].round(3)
111
+ final_df = results_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
112
+ final_df = final_df.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page"])
113
+ final_df['Page1_Text'] = final_df['Page1_Text'].str[:preview_length]
114
+ final_df['Page2_Text'] = final_df['Page2_Text'].str[:preview_length]
115
+ return final_df
116
+
117
+ def map_metadata_subdocument(subdocument_df:pd.DataFrame, metadata_source_df:pd.DataFrame, preview_length:int=200):
118
+ """Helper to map metadata for subdocument results."""
119
+ metadata_df = metadata_source_df[['file', 'page', 'text']]
120
 
121
+ subdocument_df = subdocument_df.merge(metadata_df, left_on='Page1_Start_Index', right_index=True)\
122
+ .rename(columns={'file': 'Page1_File', 'page': 'Page1_Start_Page', 'text': 'Page1_Text'})
123
+ subdocument_df = subdocument_df.merge(metadata_df[['page']], left_on='Page1_End_Index', right_index=True)\
124
+ .rename(columns={'page': 'Page1_End_Page'})
125
+ subdocument_df = subdocument_df.merge(metadata_df, left_on='Page2_Start_Index', right_index=True)\
126
+ .rename(columns={'file': 'Page2_File', 'page': 'Page2_Start_Page', 'text': 'Page2_Text'})
127
+ subdocument_df = subdocument_df.merge(metadata_df[['page']], left_on='Page2_End_Index', right_index=True)\
128
+ .rename(columns={'page': 'Page2_End_Page'})
129
+
130
+ cols = ['Page1_File', 'Page1_Start_Page', 'Page1_End_Page',
131
+ 'Page2_File', 'Page2_Start_Page', 'Page2_End_Page',
132
+ 'Match_Length', 'Page1_Text', 'Page2_Text']
133
+
134
+ # Add Avg_Similarity if it exists (it won't for greedy match unless we add it)
135
+ if 'Avg_Similarity' in subdocument_df.columns:
136
+ subdocument_df['Avg_Similarity'] = subdocument_df['Avg_Similarity'].round(3)
137
+ cols.insert(7, 'Avg_Similarity')
138
+
139
+ final_df = subdocument_df[cols]
140
+ final_df = final_df.sort_values(['Page1_File', 'Page1_Start_Page', 'Page2_File', 'Page2_Start_Page'])
141
+ final_df['Page1_Text'] = final_df['Page1_Text'].str[:preview_length]
142
+ final_df['Page2_Text'] = final_df['Page2_Text'].str[:preview_length]
143
+
144
+ return final_df
145
+
146
+ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str) -> list:
147
+ """
148
+ Saves the main results DataFrame and generates per-file redaction lists.
149
+ This function is extracted to be reusable.
150
+
151
+ Args:
152
+ final_df (pd.DataFrame): The DataFrame containing the final match results.
153
+ output_folder (str): The folder to save the output files.
154
+
155
+ Returns:
156
+ list: A list of paths to all generated files.
157
+ """
158
+ output_paths = []
159
+ output_folder_path = Path(output_folder)
160
+ output_folder_path.mkdir(exist_ok=True)
161
+
162
+ if final_df.empty:
163
+ print("No matches to save.")
164
+ return []
165
+
166
+ # 1. Save the main results DataFrame
167
+ similarity_file_output_path = output_folder_path / 'page_similarity_results.csv'
168
+ final_df.to_csv(similarity_file_output_path, index=False)
169
+
170
+ output_paths.append(str(similarity_file_output_path))
171
+ print(f"Main results saved to {similarity_file_output_path}")
172
+
173
+ # 2. Save per-file redaction lists
174
+ # Use 'Page2_File' as the source of duplicate content
175
+ grouping_col = 'Page2_File'
176
+ if grouping_col not in final_df.columns:
177
+ print("Warning: 'Page2_File' column not found. Cannot generate redaction lists.")
178
+ return output_paths
179
+
180
+ for redact_file, group in final_df.groupby(grouping_col):
181
+ output_file_name_stem = Path(redact_file).stem
182
+ output_file_path = output_folder_path / f"{output_file_name_stem}_pages_to_redact.csv"
183
+
184
+ all_pages_to_redact = set()
185
+ is_subdocument_match = 'Page2_Start_Page' in group.columns
186
+
187
+ if is_subdocument_match:
188
+ for _, row in group.iterrows():
189
+ pages_in_range = range(int(row['Page2_Start_Page']), int(row['Page2_End_Page']) + 1)
190
+ all_pages_to_redact.update(pages_in_range)
191
+ else:
192
+ pages = group['Page2_Page'].unique()
193
+ all_pages_to_redact.update(pages)
194
+
195
+ if all_pages_to_redact:
196
+ redaction_df = pd.DataFrame(sorted(list(all_pages_to_redact)), columns=['Page_to_Redact'])
197
+ redaction_df.to_csv(output_file_path, header=False, index=False)
198
 
199
+ output_paths.append(str(output_file_path))
200
+ print(f"Redaction list for {redact_file} saved to {output_file_path}")
201
+
202
+ return output_paths
203
+
204
+ def identify_similar_pages(
205
+ df_combined: pd.DataFrame,
206
+ similarity_threshold: float = 0.9,
207
+ min_word_count: int = 10,
208
+ min_consecutive_pages: int = 1,
209
+ greedy_match: bool = False, # NEW parameter
210
+ output_folder: str = OUTPUT_FOLDER,
211
+ progress=Progress(track_tqdm=True)
212
+ ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
213
+ """
214
+ Identifies similar pages with three possible strategies:
215
+ 1. Single Page: If greedy_match=False and min_consecutive_pages=1.
216
+ 2. Fixed-Length Subdocument: If greedy_match=False and min_consecutive_pages > 1.
217
+ 3. Greedy Consecutive Match: If greedy_match=True.
218
+ """
219
+
220
+ output_paths = []
221
+ progress(0.1, desc="Processing and filtering text")
222
+ df = process_data(df_combined, 'text')
223
+ df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
224
+ original_row_count = len(df)
225
+ df_filtered = df[df['word_count'] >= min_word_count].copy()
226
+ df_filtered.reset_index(drop=True, inplace=True)
227
+
228
+ print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")
229
 
230
+ if len(df_filtered) < 2:
231
+ return pd.DataFrame(), [], df_combined
232
+
233
  vectorizer = TfidfVectorizer()
234
+ tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])
235
 
236
  progress(0.3, desc="Calculating text similarity")
237
+ similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
238
+ coo_matrix = similarity_matrix.tocoo()
239
+
240
+ # Create a DataFrame of all individual page pairs above the threshold.
241
+ # This is the base for all three matching strategies.
242
+ similar_pages = [
243
+ (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
244
+ if r < c and v >= similarity_threshold
245
+ ]
246
+
247
+ if not similar_pages:
248
+ return pd.DataFrame(), [], df_combined
249
+
250
+ base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
251
 
252
+ progress(0.6, desc="Aggregating results based on matching strategy")
253
+
254
+ if greedy_match:
255
+ print("Finding matches using greedy consecutive strategy.")
256
+
257
+ # A set of pairs for fast lookups of (page1_idx, page2_idx)
258
+ valid_pairs_set = set(zip(base_similarity_df['Page1_Index'], base_similarity_df['Page2_Index']))
259
+
260
+ # Keep track of indices that have been used in a sequence
261
+ consumed_indices_1 = set()
262
+ consumed_indices_2 = set()
263
+
264
+ all_sequences = []
265
 
266
+ # Iterate through all potential starting pairs, sorted for consistent results
267
+ sorted_pairs = base_similarity_df.sort_values(['Page1_Index', 'Page2_Index'])
 
268
 
269
+ for _, row in sorted_pairs.iterrows():
270
+ start_idx1, start_idx2 = int(row['Page1_Index']), int(row['Page2_Index'])
271
+
272
+ # If this pair has already been consumed by a previous sequence, skip it
273
+ if start_idx1 in consumed_indices_1 or start_idx2 in consumed_indices_2:
274
+ continue
275
+
276
+ # This is a new sequence, start expanding it
277
+ current_sequence = [(start_idx1, start_idx2)]
278
+ k = 1
279
+ while True:
280
+ next_idx1 = start_idx1 + k
281
+ next_idx2 = start_idx2 + k
282
+
283
+ # Check if the next pair in the sequence is a valid match
284
+ if (next_idx1, next_idx2) in valid_pairs_set and \
285
+ next_idx1 not in consumed_indices_1 and \
286
+ next_idx2 not in consumed_indices_2:
287
+ current_sequence.append((next_idx1, next_idx2))
288
+ k += 1
289
+ else:
290
+ # The sequence has ended
291
+ break
292
+
293
+ # Record the found sequence and mark all its pages as consumed
294
+ sequence_indices_1 = [p[0] for p in current_sequence]
295
+ sequence_indices_2 = [p[1] for p in current_sequence]
296
+
297
+ all_sequences.append({
298
+ 'Page1_Start_Index': sequence_indices_1[0], 'Page1_End_Index': sequence_indices_1[-1],
299
+ 'Page2_Start_Index': sequence_indices_2[0], 'Page2_End_Index': sequence_indices_2[-1],
300
+ 'Match_Length': len(current_sequence)
301
+ })
302
+
303
+ consumed_indices_1.update(sequence_indices_1)
304
+ consumed_indices_2.update(sequence_indices_2)
305
+
306
+ if not all_sequences:
307
+ return pd.DataFrame(), [], df_combined
308
+
309
+ subdocument_df = pd.DataFrame(all_sequences)
310
+ # We can add back the average similarity if needed, but it requires more lookups.
311
+ # For now, we'll omit it for simplicity in the greedy approach.
312
+ # ... (The rest is metadata mapping, same as the subdocument case)
313
+
314
+ elif min_consecutive_pages > 1:
315
+ # --- STRATEGY 2: Fixed-Length Subdocument Matching ---
316
+ print(f"Finding consecutive page matches (min_consecutive_pages > 1)")
317
+ similarity_df = base_similarity_df.copy()
318
+ similarity_df.sort_values(['Page1_Index', 'Page2_Index'], inplace=True)
319
+ is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
320
+ block_id = is_consecutive.eq(False).cumsum()
321
+ grouped = similarity_df.groupby(block_id)
322
+ agg_results = grouped.agg(
323
+ Page1_Start_Index=('Page1_Index', 'first'), Page2_Start_Index=('Page2_Index', 'first'),
324
+ Page1_End_Index=('Page1_Index', 'last'), Page2_End_Index=('Page2_Index', 'last'),
325
+ Match_Length=('Page1_Index', 'size'), Avg_Similarity=('Similarity_Score', 'mean')
326
+ ).reset_index(drop=True)
327
+ subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
328
+ if subdocument_df.empty: return pd.DataFrame(), [], df_combined
329
+
330
+ else:
331
+ # --- STRATEGY 1: Single Page Matching ---
332
+ print(f"Finding single page matches (min_consecutive_pages=1)")
333
+ final_df = map_metadata_single_page(base_similarity_df, df_filtered)
334
+ # The rest of the logic (saving files) is handled after this if/else block
335
+ pass # The final_df is already prepared
336
+
337
+ # --- Map metadata and format output ---
338
+ # This block now handles the output for both subdocument strategies (2 and 3)
339
+ if greedy_match or min_consecutive_pages > 1:
340
+ final_df = map_metadata_subdocument(subdocument_df, df_filtered)
341
 
342
+ progress(0.8, desc="Saving output files")
343
 
344
+ output_paths = save_results_and_redaction_lists(final_df, output_folder)
345
 
346
+ return final_df, output_paths, df_combined
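To make the greedy consecutive strategy above concrete, here is a small self-contained sketch that mirrors the sequence-expansion loop in identify_similar_pages, run on a hard-coded set of similar page pairs (the indices are hypothetical):

# Pairs (3,10), (4,11), (5,12) collapse into one subdocument match of length 3;
# (20,30) remains a single-page match.
valid_pairs_set = {(3, 10), (4, 11), (5, 12), (20, 30)}
consumed_1, consumed_2, sequences = set(), set(), []

for start_1, start_2 in sorted(valid_pairs_set):
    if start_1 in consumed_1 or start_2 in consumed_2:
        continue
    sequence = [(start_1, start_2)]
    k = 1
    while (start_1 + k, start_2 + k) in valid_pairs_set and \
          start_1 + k not in consumed_1 and start_2 + k not in consumed_2:
        sequence.append((start_1 + k, start_2 + k))
        k += 1
    sequences.append({
        "Page1_Start_Index": sequence[0][0], "Page1_End_Index": sequence[-1][0],
        "Page2_Start_Index": sequence[0][1], "Page2_End_Index": sequence[-1][1],
        "Match_Length": len(sequence),
    })
    consumed_1.update(p[0] for p in sequence)
    consumed_2.update(p[1] for p in sequence)

print(sequences)
# [{'Page1_Start_Index': 3, 'Page1_End_Index': 5, ..., 'Match_Length': 3},
#  {'Page1_Start_Index': 20, ..., 'Match_Length': 1}]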
 
 
 
 
347
 
348
+ # ==============================================================================
349
+ # GRADIO HELPER FUNCTIONS
350
+ # ==============================================================================
 
 
 
 
 
 
351
 
352
+ # full_data:pd.DataFrame,
353
+ def handle_selection_and_preview(evt: gr.SelectData, results_df:pd.DataFrame, full_duplicate_data_by_file: dict):
354
+ """
355
+ This single function handles a user selecting a row. It:
356
+ 1. Determines the selected row index.
357
+ 2. Calls the show_page_previews function to get the text data.
358
+ 3. Returns all the necessary outputs for the UI.
359
+ """
360
+ # If the user deselects, the event might be None.
361
+ if not evt:
362
+ return None, None, None # Clear state and both preview panes
363
 
364
+ # 1. Get the selected index
365
+ selected_index = evt.index[0]
 
366
 
367
+ # 2. Get the preview data
368
+ page1_data, page2_data = show_page_previews(full_duplicate_data_by_file, results_df, evt)
 
369
 
370
+ # 3. Return all three outputs in the correct order
371
+ return selected_index, page1_data, page2_data
372
 
373
+ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder="./output/"):
374
+ """
375
+ Removes a selected row from the results DataFrame, regenerates output files,
376
+ and clears the text preview panes.
377
+ """
378
+ if selected_index is None:
379
+ gr.Warning("No match selected. Please click on a row in the table first.")
380
+ # Return the original dataframe and update=False for the files
381
+ return results_df, gr.update(), None, None
382
+
383
+ if results_df.empty:
384
+ gr.Warning("No duplicate page results found, nothing to exclude.")
385
+ return results_df, gr.update(), None, None
386
 
387
+ # Drop the selected row
388
+ updated_df = results_df.drop(selected_index).reset_index(drop=True)
389
+
390
+ # Recalculate all output files using the helper function
391
+ new_output_paths = save_results_and_redaction_lists(updated_df, output_folder)
392
+
393
+ gr.Info(f"Match at row {selected_index} excluded. Output files have been updated.")
394
+
395
+ # Return the updated dataframe, the new file list, and clear the preview panes
396
+ return updated_df, new_output_paths, None, None
397
 
398
+ def run_duplicate_analysis(files:list[str], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
399
+ """
400
+ Wrapper function updated to include the 'greedy_match' boolean.
401
+ """
402
+ if not files:
403
+ gr.Warning("Please upload files to analyze.")
404
+ return None, None, None
405
+
406
+ progress(0, desc="Combining input files...")
407
+ df_combined, _ = combine_ocr_output_text(files)
408
+
409
+ if df_combined.empty:
410
+ gr.Warning("No data found in the uploaded files.")
411
+ return None, None, None
412
+
413
+ # Call the main analysis function with the new parameter
414
+ results_df, output_paths, full_df = identify_similar_pages(
415
+ df_combined=df_combined,
416
+ similarity_threshold=threshold,
417
+ min_word_count=min_words,
418
+ min_consecutive_pages=int(min_consecutive),
419
+ greedy_match=greedy_match,
420
+ progress=progress
421
+ )
422
+
423
+ # Clip text to the first preview_length characters for the preview
424
+ full_df['text'] = full_df['text'].str[:preview_length]
425
+
426
+ # Preprocess full_data (without preview text) for fast access (run once)
427
+ full_data_by_file = {
428
+ file: df.sort_values('page').set_index('page')
429
+ for file, df in full_df.drop(["text_clean"],axis=1).groupby('file')
430
+ }
431
+
432
+ if results_df.empty:
433
+ gr.Info(f"No duplicate pages found, no results returned.")
434
+
435
+ return results_df, output_paths, full_data_by_file # full_df,
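run_duplicate_analysis is written to be bound directly to a Gradio event. A hypothetical wiring sketch follows; the component names and layout are assumptions for illustration, not taken from the app code:

import gradio as gr
from tools.find_duplicate_pages import run_duplicate_analysis

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple", label="OCR output CSVs")
    threshold = gr.Slider(0.5, 1.0, value=0.9, label="Similarity threshold")
    min_words = gr.Number(value=10, label="Minimum words per page")
    min_consecutive = gr.Number(value=1, label="Minimum consecutive pages")
    greedy = gr.Checkbox(value=False, label="Greedy consecutive matching")
    results = gr.Dataframe(label="Duplicate page matches")
    out_files = gr.File(label="Output files", file_count="multiple")
    full_data_state = gr.State()

    # Outputs mirror the function's return values: results table, file paths, per-file page data
    gr.Button("Find duplicate pages").click(
        run_duplicate_analysis,
        inputs=[in_files, threshold, min_words, min_consecutive, greedy],
        outputs=[results, out_files, full_data_state],
    )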
436
 
437
+ def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
438
+ """
439
+ Optimized version using pre-partitioned and indexed full_data.
440
+ Triggered when a user selects a row in the results DataFrame.
441
+ """
442
+ if not full_data_by_file or results_df is None or not evt:
443
+ return None, None
444
 
445
+ selected_row = results_df.iloc[evt.index[0], :]
446
 
447
+ is_subdocument_match = 'Page1_Start_Page' in selected_row
 
 
 
448
 
449
+ if is_subdocument_match:
450
+ file1, start1, end1 = selected_row['Page1_File'], selected_row['Page1_Start_Page'], selected_row['Page1_End_Page']
451
+ file2, start2, end2 = selected_row['Page2_File'], selected_row['Page2_Start_Page'], selected_row['Page2_End_Page']
452
+
453
+ page1_data = full_data_by_file[file1].loc[start1:end1, ['text']].reset_index()
454
+ page2_data = full_data_by_file[file2].loc[start2:end2, ['text']].reset_index()
455
+
456
+ else:
457
+ file1, page1 = selected_row['Page1_File'], selected_row['Page1_Page']
458
+ file2, page2 = selected_row['Page2_File'], selected_row['Page2_Page']
459
 
460
+ page1_data = full_data_by_file[file1].loc[[page1], ['text']].reset_index()
461
+ page2_data = full_data_by_file[file2].loc[[page2], ['text']].reset_index()
462
+
463
+ page1_data['text'] = page1_data['text'].str[:preview_length]
464
+ page2_data['text'] = page2_data['text'].str[:preview_length]
465
+
466
+ return page1_data[['page', 'text']], page2_data[['page', 'text']]
467
+
468
+ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df:pd.DataFrame, doc_file_name_with_extension_textbox:str, review_file_state:pd.DataFrame, duplicate_output_paths:list[str], pymupdf_doc:object, page_sizes:list[dict], all_existing_annotations:list[dict]):
469
+ '''
470
+ Take a list of suggested whole pages to redact and apply them to the review file data for the PDF currently under review.
471
+ '''
472
+ # Create a copy of annotations to avoid modifying the original
473
+ all_annotations = all_existing_annotations.copy()
474
 
475
+ if not pymupdf_doc:
476
+ print("Warning: No document file currently under review. Please upload a document on the 'Review redactions' tab to apply whole page redactions.")
477
+ raise Warning("No document file currently under review. Please upload a document on the 'Review redactions' tab to apply whole page redactions.")
478
+ return review_file_state, all_annotations
479
 
480
+ # Initialize list of pages to redact
481
+ list_whole_pages_to_redact = []
482
+
483
+ # Get list of pages to redact from either dataframe or file
484
+ if not duplicate_page_numbers_df.empty:
485
+ list_whole_pages_to_redact = duplicate_page_numbers_df.iloc[:, 0].tolist()
486
+ elif duplicate_output_paths:
487
+ expected_duplicate_pages_to_redact_name = f"{doc_file_name_with_extension_textbox}"
488
+ whole_pages_list = pd.DataFrame() # Initialize empty DataFrame
489
 
490
+ for output_file in duplicate_output_paths:
491
+ # Note: output_file.name might not be available if output_file is just a string path
492
+ # If it's a Path object or similar, .name is fine. Otherwise, parse from string.
493
+ file_name_from_path = output_file.split('/')[-1] if isinstance(output_file, str) else output_file.name
494
+ if expected_duplicate_pages_to_redact_name in file_name_from_path:
495
+ whole_pages_list = pd.read_csv(output_file, header=None) # Use output_file directly if it's a path
496
+ break
497
+
498
+ if not whole_pages_list.empty:
499
+ list_whole_pages_to_redact = whole_pages_list.iloc[:, 0].tolist()
500
+
501
+ # Convert to set to remove duplicates, then back to list
502
+ list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
503
+
504
+ if not list_whole_pages_to_redact:
505
+ # Assuming gr is defined (e.g., gradio)
506
+ print("No relevant list of whole pages to redact found, returning inputs.")
507
+ raise Warning("Warning: No relevant list of whole pages to redact found, returning inputs.")
508
+ return review_file_state, all_existing_annotations
509
+
510
+ new_annotations = []
511
+
512
+ # Process each page for redaction
513
+ for page in list_whole_pages_to_redact:
514
+ try:
515
+ page_index = int(page) - 1
516
+ if page_index < 0 or page_index >= len(pymupdf_doc):
517
+ print(f"Page {page} is out of bounds for a document with {len(pymupdf_doc)} pages, skipping.")
518
+ continue
519
 
520
+ pymupdf_page = pymupdf_doc[page_index]
521
+
522
+ # Find the matching page size dictionary
523
+ page_size = next((size for size in page_sizes if size["page"] == int(page)), None)
524
 
525
+ if not page_size:
526
+ print(f"Page {page} not found in page_sizes object, skipping.")
527
+ continue
528
+
529
+ rect_height = page_size["cropbox_height"]
530
+ rect_width = page_size["cropbox_width"]
531
+ image = page_size["image_path"] # This `image` likely represents the page identifier
532
+
533
+ # Create the whole page redaction box
534
+ annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, pymupdf_page, border=0.005, redact_pdf=False)
535
+
536
+ # Find existing annotation for this image/page
537
+ current_page_existing_boxes_group = next((annot_group for annot_group in all_annotations if annot_group["image"] == image), None)
538
+
539
+ new_annotation_group = {
540
+ "image": image,
541
+ "boxes": [annotation_box]
542
+ }
543
+
544
+ if current_page_existing_boxes_group:
545
+ # Check if we already have a whole page redaction for this page
546
+ if not any(box["label"] == "Whole page" for box in current_page_existing_boxes_group["boxes"]):
547
+ current_page_existing_boxes_group["boxes"].append(annotation_box)
548
+
549
+ else:
550
+ # Optional: Print a message if a whole-page redaction already exists for this page
551
+ print(f"Whole page redaction for page {page} already exists in annotations, skipping addition.")
552
+ pass
553
+ else: # Create new annotation entry
554
+
555
+ all_annotations.append(new_annotation_group)
556
+
557
+ new_annotations.append(new_annotation_group)
558
+
559
+ except Exception as e:
560
+ print(f"Error processing page {page}: {str(e)}")
561
+ continue
562
+
563
+ # Convert annotations to dataframe and combine with existing review file
564
+ whole_page_review_file = convert_annotation_data_to_dataframe(new_annotations)
565
+
566
+ # Ensure all required columns are present in both DataFrames before concat
567
+ # This is a common point of error if DFs have different schemas
568
+ expected_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
569
+
570
+ for col in expected_cols:
571
+ if col not in review_file_state.columns:
572
+ review_file_state[col] = None # Or an appropriate default value
573
+ if col not in whole_page_review_file.columns:
574
+ whole_page_review_file[col] = None
575
+
576
+ review_file_out = pd.concat([review_file_state, whole_page_review_file], ignore_index=True)
577
+ review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"])
578
+
579
+ # --- Remove duplicate entries from the final DataFrame ---
580
+ dedup_subset_cols = ['page', 'label', 'text', 'id']
581
+
582
+ # Ensure these columns exist before trying to use them as subset for drop_duplicates
583
+ if all(col in review_file_out.columns for col in dedup_subset_cols):
584
+ review_file_out = review_file_out.drop_duplicates(
585
+ subset=dedup_subset_cols,
586
+ keep='first' # Keep the first occurrence of a duplicate redaction
587
+ )
588
+ else:
589
+ print(f"Warning: Not all columns required for de-duplication ({dedup_subset_cols}) are present in review_file_out. Skipping specific de-duplication.")
590
+ # You might want a fallback or to inspect what's missing
591
+
592
+ review_file_out.to_csv(OUTPUT_FOLDER + "review_file_out_after_whole_page.csv")
593
 
594
+ gr.Info("Successfully created whole page redactions. Go to the 'Review redactions' tab to see them.")
595
 
596
+ return review_file_out, all_annotations
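For reference, the annotation structure that apply_whole_page_redactions_from_list builds for each duplicate page looks like the sketch below. The values are hypothetical; the box keys match those set by redact_whole_pymupdf_page, here with the relative border of 0.005 used above:

# One annotation group per page image, containing a single whole-page box
whole_page_box = {
    "xmin": 0.005, "ymin": 0.005,   # relative coordinates, border = 0.005
    "xmax": 0.995, "ymax": 0.995,
    "color": (0, 0, 0),
    "label": "Whole page",
}

new_annotation_group = {
    "image": "input/example_doc/page_3.png",  # hypothetical page image path from page_sizes
    "boxes": [whole_page_box],
}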
tools/helper_functions.py CHANGED
@@ -146,6 +146,14 @@ def ensure_output_folder_exists(output_folder:str):
146
  else:
147
  print(f"The {output_folder} folder already exists.")
148
 
 
 
 
 
 
 
 
 
149
  def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
150
  '''
151
  When file is loaded, update the column dropdown choices and write to relevant data states.
 
146
  else:
147
  print(f"The {output_folder} folder already exists.")
148
 
149
+ def _get_env_list(env_var_name: str) -> List[str]:
150
+ """Parses a bracketed, comma-separated environment variable value (e.g. "['A', 'B']") into a list of strings."""
151
+ value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
152
+ if not value:
153
+ return []
154
+ # Split by comma and filter out any empty strings that might result from extra commas
155
+ return [s.strip() for s in value.split(',') if s.strip()]
156
+
157
  def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
158
  '''
159
  When file is loaded, update the column dropdown choices and write to relevant data states.
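Note that the new _get_env_list helper above expects the raw environment variable value rather than the variable name: the surrounding brackets are stripped, quotes are removed, and the remainder is split on commas. A standalone sketch of that behaviour (the example value is hypothetical):

from typing import List

def _get_env_list(env_value: str) -> List[str]:
    # Mirrors the helper above: strip the surrounding brackets, drop quotes,
    # then split on commas, ignoring empty entries
    value = env_value[1:-1].strip().replace('"', '').replace("'", "")
    if not value:
        return []
    return [s.strip() for s in value.split(',') if s.strip()]

print(_get_env_list("['NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER']"))
# ['NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER']
print(_get_env_list("[]"))
# []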
tools/redaction_review.py CHANGED
@@ -180,7 +180,7 @@ def update_annotator_page_from_review_df(
180
  ) -> Tuple[object, List[dict], int, List[dict], pd.DataFrame, int]: # Correcting return types based on usage
181
  '''
182
  Update the visible annotation object and related objects with the latest review file information,
183
- optimizing by processing only the current page's data.
184
  '''
185
  # Assume current_image_annotations_state is List[dict] and current_page_annotator is dict
186
  out_image_annotations_state: List[dict] = list(current_image_annotations_state) # Make a copy to avoid modifying input in place
@@ -220,7 +220,6 @@ def update_annotator_page_from_review_df(
220
  else:
221
  print("Warning: Page sizes DataFrame became empty after processing.")
222
 
223
- # --- OPTIMIZATION: Process only the current page's data from review_df ---
224
  if not review_df.empty:
225
  # Filter review_df for the current page
226
  # Ensure 'page' column in review_df is comparable to page_num_reported
@@ -1040,9 +1039,12 @@ def reset_dropdowns(df:pd.DataFrame):
1040
 
1041
  return recogniser_entities_drop, text_entities_drop, page_entities_drop
1042
 
 
 
 
1043
  def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):
1044
 
1045
- row_value_page = evt.row_value[0] # This is the page number value
1046
  row_value_label = evt.row_value[1] # This is the label number value
1047
  row_value_text = evt.row_value[2] # This is the text number value
1048
  row_value_id = evt.row_value[3] # This is the text number value
@@ -1072,13 +1074,22 @@ def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
1072
 
1073
  def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
1074
 
1075
- row_value_page = evt.row_value[0] # This is the page_number value
1076
  row_value_text = evt.row_value[1] # This is the text contents
1077
 
1078
  row_value_df = pd.DataFrame(data={"page":[row_value_page], "text":[row_value_text]})
1079
 
1080
  return row_value_page, row_value_df
1081
 
 
 
 
 
 
 
 
 
 
1082
  def get_all_rows_with_same_text(df: pd.DataFrame, text: str):
1083
  '''
1084
  Get all rows with the same text as the selected row
 
180
  ) -> Tuple[object, List[dict], int, List[dict], pd.DataFrame, int]: # Correcting return types based on usage
181
  '''
182
  Update the visible annotation object and related objects with the latest review file information,
183
+ optimising by processing only the current page's data.
184
  '''
185
  # Assume current_image_annotations_state is List[dict] and current_page_annotator is dict
186
  out_image_annotations_state: List[dict] = list(current_image_annotations_state) # Make a copy to avoid modifying input in place
 
220
  else:
221
  print("Warning: Page sizes DataFrame became empty after processing.")
222
 
 
223
  if not review_df.empty:
224
  # Filter review_df for the current page
225
  # Ensure 'page' column in review_df is comparable to page_num_reported
 
1039
 
1040
  return recogniser_entities_drop, text_entities_drop, page_entities_drop
1041
 
1042
+ def increase_bottom_page_count_based_on_top(page_number:int):
1043
+ return int(page_number)
1044
+
1045
  def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):
1046
 
1047
+ row_value_page = int(evt.row_value[0]) # This is the page number value
1048
  row_value_label = evt.row_value[1] # This is the label number value
1049
  row_value_text = evt.row_value[2] # This is the text number value
1050
  row_value_id = evt.row_value[3] # This is the text number value
 
1074
 
1075
  def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
1076
 
1077
+ row_value_page = int(evt.row_value[0]) # This is the page_number value
1078
  row_value_text = evt.row_value[1] # This is the text contents
1079
 
1080
  row_value_df = pd.DataFrame(data={"page":[row_value_page], "text":[row_value_text]})
1081
 
1082
  return row_value_page, row_value_df
1083
 
1084
+ # When a user selects a row in the duplicate results table
1085
+ def store_duplicate_selection(evt: gr.SelectData):
1086
+ if not evt.empty:
1087
+ selected_index = evt.index[0]
1088
+ else:
1089
+ selected_index = None
1090
+
1091
+ return selected_index
1092
+
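store_duplicate_selection is intended to be attached to the select event of the duplicate-results table, so that the clicked row index is kept in state for exclude_match. A hypothetical wiring sketch (component names are assumptions):

import gradio as gr
from tools.redaction_review import store_duplicate_selection

with gr.Blocks() as demo:
    results_df = gr.Dataframe(label="Duplicate page matches")
    selected_duplicate_row_index = gr.State(None)

    # Gradio injects the gr.SelectData event object based on the type annotation
    results_df.select(
        store_duplicate_selection,
        inputs=None,
        outputs=selected_duplicate_row_index,
    )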
1093
  def get_all_rows_with_same_text(df: pd.DataFrame, text: str):
1094
  '''
1095
  Get all rows with the same text as the selected row