seanpedrickcase committed on
Commit
1cb1897
·
1 Parent(s): 96ac47b

Removed some extraneous test steps. Improved Example loading and feedback, and redaction feedback. Minor security updates. Fixed Adobe xfdf file parsing.

Browse files
.coveragerc ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [run]
2
+ source = .
3
+ omit =
4
+ */tests/*
5
+ */test/*
6
+ */__pycache__/*
7
+ */venv/*
8
+ */env/*
9
+ */build/*
10
+ */dist/*
11
+ */cdk/*
12
+ */docs/*
13
+ */example_data/*
14
+ */examples/*
15
+ */feedback/*
16
+ */logs/*
17
+ */old_code/*
18
+ */output/*
19
+ */tmp/*
20
+ */usage/*
21
+ */tld/*
22
+ */tesseract/*
23
+ */poppler/*
24
+ config*.py
25
+ setup.py
26
+ lambda_entrypoint.py
27
+ entrypoint.sh
28
+ cli_redact.py
29
+ load_dynamo_logs.py
30
+ load_s3_logs.py
31
+ *.spec
32
+ Dockerfile
33
+ *.qmd
34
+ *.md
35
+ *.txt
36
+ *.yml
37
+ *.yaml
38
+ *.json
39
+ *.csv
40
+ *.env
41
+ *.bat
42
+ *.ps1
43
+ *.sh
44
+
45
+ [report]
46
+ exclude_lines =
47
+ pragma: no cover
48
+ def __repr__
49
+ if self.debug:
50
+ if settings.DEBUG
51
+ raise AssertionError
52
+ raise NotImplementedError
53
+ if 0:
54
+ if __name__ == .__main__.:
55
+ class .*\bProtocol\):
56
+ @(abc\.)?abstractmethod
.github/workflows/ci.yml CHANGED
@@ -90,6 +90,10 @@ jobs:
90
  run: |
91
  python .github/scripts/setup_test_data.py
92
 
 
 
 
 
93
  - name: Run CLI tests
94
  run: |
95
  cd test
@@ -101,16 +105,16 @@ jobs:
101
 
102
  - name: Run tests with coverage
103
  run: |
104
- pytest test/test.py --cov=. --cov-report=xml --cov-report=html --cov-report=term
105
 
106
- - name: Upload coverage to Codecov
107
- uses: codecov/codecov-action@v3
108
- if: matrix.python-version == '3.11'
109
- with:
110
- file: ./coverage.xml
111
- flags: unittests
112
- name: codecov-umbrella
113
- fail_ci_if_error: false
114
 
115
  - name: Upload test results
116
  uses: actions/upload-artifact@v4
 
90
  run: |
91
  python .github/scripts/setup_test_data.py
92
 
93
+ - name: Clean up problematic config files
94
+ run: |
95
+ rm -f config*.py || true
96
+
97
  - name: Run CLI tests
98
  run: |
99
  cd test
 
105
 
106
  - name: Run tests with coverage
107
  run: |
108
+ pytest test/test.py --cov=. --cov-config=.coveragerc --cov-report=xml --cov-report=html --cov-report=term
109
 
110
+ #- name: Upload coverage to Codecov - not necessary
111
+ # uses: codecov/codecov-action@v3
112
+ # if: matrix.python-version == '3.11'
113
+ # with:
114
+ # file: ./coverage.xml
115
+ # flags: unittests
116
+ # name: codecov-umbrella
117
+ # fail_ci_if_error: false
118
 
119
  - name: Upload test results
120
  uses: actions/upload-artifact@v4
app.py CHANGED
@@ -93,6 +93,7 @@ from tools.config import (
93
  SAVE_LOGS_TO_CSV,
94
  SAVE_LOGS_TO_DYNAMODB,
95
  SESSION_OUTPUT_FOLDER,
 
96
  SHOW_COSTS,
97
  SHOW_EXAMPLES,
98
  SHOW_LANGUAGE_SELECTION,
@@ -1008,75 +1009,101 @@ with app:
1008
  ###
1009
  with gr.Tab("Redact PDFs/images"):
1010
 
1011
- # Examples for PDF/image redaction
1012
  # Examples for PDF/image redaction
1013
  if SHOW_EXAMPLES == "True":
1014
  gr.Markdown(
1015
  "### Try an example - Click on an example below and then the 'Extract text and redact document' button:"
1016
  )
1017
-
1018
  # Check which example files exist and create examples only for available files
1019
  example_files = [
1020
  "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
1021
- "example_data/example_complaint_letter.jpg",
1022
  "example_data/graduate-job-example-cover-letter.pdf",
1023
- "example_data/Partnership-Agreement-Toolkit_0_0.pdf"
1024
  ]
1025
-
1026
- available_examples = []
1027
- example_labels = []
1028
-
1029
  # Check each example file and add to examples if it exists
1030
  if os.path.exists(example_files[0]):
1031
- available_examples.append([
1032
- [example_files[0]],
1033
- "Local model - selectable text",
1034
- "Local",
1035
- [],
1036
- CHOSEN_REDACT_ENTITIES,
1037
- CHOSEN_COMPREHEND_ENTITIES,
1038
- [example_files[0]],
1039
- ])
 
 
1040
  example_labels.append("PDF with selectable text redaction")
1041
-
1042
  if os.path.exists(example_files[1]):
1043
- available_examples.append([
1044
- [example_files[1]],
1045
- "Local OCR model - PDFs without selectable text",
1046
- "Local",
1047
- [],
1048
- CHOSEN_REDACT_ENTITIES,
1049
- CHOSEN_COMPREHEND_ENTITIES,
1050
- [example_files[1]],
1051
- ])
 
 
1052
  example_labels.append("Image redaction with local OCR")
1053
-
1054
  if os.path.exists(example_files[2]):
1055
- available_examples.append([
1056
- [example_files[2]],
1057
- "Local OCR model - PDFs without selectable text",
1058
- "Local",
1059
- [],
1060
- ["TITLES", "PERSON", "DATE_TIME"],
1061
- CHOSEN_COMPREHEND_ENTITIES,
1062
- [example_files[2]],
1063
- ])
1064
- example_labels.append("PDF redaction with custom entities (TITLES, PERSON, DATE_TIME)")
1065
-
 
 
 
 
1066
  if os.path.exists(example_files[3]):
1067
- available_examples.append([
1068
- [example_files[3]],
1069
- "AWS Textract service - all PDF types",
1070
- "AWS Comprehend",
1071
- ["Extract handwriting", "Extract signatures"],
1072
- CHOSEN_REDACT_ENTITIES,
1073
- CHOSEN_COMPREHEND_ENTITIES,
1074
- [example_files[3]],
1075
- ])
1076
- example_labels.append("PDF redaction with AWS services and signature detection")
1077
-
 
 
 
 
 
1078
  # Only create examples if we have available files
1079
  if available_examples:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1080
  redaction_examples = gr.Examples(
1081
  examples=available_examples,
1082
  inputs=[
@@ -1089,6 +1116,8 @@ with app:
1089
  prepared_pdf_state,
1090
  ],
1091
  example_labels=example_labels,
 
 
1092
  )
1093
 
1094
  with gr.Accordion("Redact document", open=True):
@@ -1664,16 +1693,32 @@ with app:
1664
  "Search for duplicate pages/subdocuments in your ocr_output files. By default, this function will search for duplicate text across multiple pages, and then join consecutive matching pages together into matched 'subdocuments'. The results can be reviewed below, false positives removed, and then the verified results applied to a document you have loaded in on the 'Review redactions' tab."
1665
  )
1666
 
 
 
 
1667
  # Examples for duplicate page detection
1668
  if SHOW_EXAMPLES == "True":
1669
  gr.Markdown(
1670
  "### Try an example - Click on an example below and then the 'Identify duplicate pages/subdocuments' button:"
1671
  )
1672
-
1673
  # Check if duplicate example file exists
1674
- duplicate_example_file = "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv"
1675
-
 
 
1676
  if os.path.exists(duplicate_example_file):
 
 
 
 
 
 
 
 
 
 
 
1677
  duplicate_examples = gr.Examples(
1678
  examples=[
1679
  [
@@ -1699,6 +1744,8 @@ with app:
1699
  "Find duplicate pages of text in document OCR outputs",
1700
  "Find duplicate text lines in document OCR outputs",
1701
  ],
 
 
1702
  )
1703
 
1704
  with gr.Accordion("Step 1: Configure and run analysis", open=True):
@@ -1821,7 +1868,7 @@ with app:
1821
  ###
1822
  with gr.Tab(label="Word or Excel/csv files"):
1823
  gr.Markdown(
1824
- """Choose Word or a tabular data file (xlsx or csv) to redact. Note that when redacting complex Word files with e.g. images, some content/formatting will be removed, and it may not attempt to redact headers. You may prefer to convert the doc file to PDF in Word, and then run it through the first tab of this app (Print to PDF in print settings). Alternatively, an xlsx file output is provided when redacting docx files directly to allow for copying and pasting outputs back into the original document if preferred."""
1825
  )
1826
 
1827
  # Examples for Word/Excel/csv redaction and tabular duplicate detection
@@ -1829,53 +1876,78 @@ with app:
1829
  gr.Markdown(
1830
  "### Try an example - Click on an example below and then the 'Redact text/data files' button for redaction, or the 'Find duplicate cells/rows' button for duplicate detection:"
1831
  )
1832
-
1833
  # Check which tabular example files exist
1834
  tabular_example_files = [
1835
  "example_data/combined_case_notes.csv",
1836
  "example_data/Bold minimalist professional cover letter.docx",
1837
- "example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
1838
  ]
1839
-
1840
- available_tabular_examples = []
1841
- tabular_example_labels = []
1842
-
1843
  # Check each tabular example file and add to examples if it exists
1844
  if os.path.exists(tabular_example_files[0]):
1845
- available_tabular_examples.append([
1846
- [tabular_example_files[0]],
1847
- ["Case Note", "Client"],
1848
- "Local",
1849
- "replace with 'REDACTED'",
1850
- [tabular_example_files[0]],
1851
- ["Case Note"],
1852
- ])
1853
- tabular_example_labels.append("CSV file redaction with specific columns - remove text")
1854
-
 
 
 
 
1855
  if os.path.exists(tabular_example_files[1]):
1856
- available_tabular_examples.append([
1857
- [tabular_example_files[1]],
1858
- [],
1859
- "Local",
1860
- "replace with 'REDACTED'",
1861
- [],
1862
- [],
1863
- ])
1864
- tabular_example_labels.append("Word document redaction - replace with REDACTED")
1865
-
 
 
 
 
1866
  if os.path.exists(tabular_example_files[2]):
1867
- available_tabular_examples.append([
1868
- [tabular_example_files[2]],
1869
- ["text"],
1870
- "Local",
1871
- "replace with 'REDACTED'",
1872
- [tabular_example_files[2]],
1873
- ["text"],
1874
- ])
1875
- tabular_example_labels.append("Tabular duplicate detection in CSV files")
1876
-
 
 
 
 
1877
  # Only create examples if we have available files
1878
  if available_tabular_examples:
 
 
 
 
 
 
 
 
 
 
 
 
 
1879
  tabular_examples = gr.Examples(
1880
  examples=available_tabular_examples,
1881
  inputs=[
@@ -1887,6 +1959,8 @@ with app:
1887
  tabular_text_columns,
1888
  ],
1889
  example_labels=tabular_example_labels,
 
 
1890
  )
1891
 
1892
  with gr.Accordion("Redact Word or Excel/csv files", open=True):
@@ -2313,7 +2387,7 @@ with app:
2313
  # Recalculate estimated costs based on changes to inputs
2314
  if SHOW_COSTS == "True":
2315
  # Calculate costs
2316
- total_pdf_page_count.change(
2317
  calculate_aws_costs,
2318
  inputs=[
2319
  total_pdf_page_count,
@@ -2325,7 +2399,7 @@ with app:
2325
  ],
2326
  outputs=[estimated_aws_costs_number],
2327
  )
2328
- text_extract_method_radio.change(
2329
  fn=check_for_relevant_ocr_output_with_words,
2330
  inputs=[
2331
  doc_file_name_no_extension_textbox,
@@ -2345,7 +2419,7 @@ with app:
2345
  ],
2346
  outputs=[estimated_aws_costs_number],
2347
  )
2348
- pii_identification_method_drop.change(
2349
  calculate_aws_costs,
2350
  inputs=[
2351
  total_pdf_page_count,
@@ -2357,7 +2431,7 @@ with app:
2357
  ],
2358
  outputs=[estimated_aws_costs_number],
2359
  )
2360
- handwrite_signature_checkbox.change(
2361
  calculate_aws_costs,
2362
  inputs=[
2363
  total_pdf_page_count,
@@ -2369,7 +2443,7 @@ with app:
2369
  ],
2370
  outputs=[estimated_aws_costs_number],
2371
  )
2372
- textract_output_found_checkbox.change(
2373
  calculate_aws_costs,
2374
  inputs=[
2375
  total_pdf_page_count,
@@ -2381,7 +2455,7 @@ with app:
2381
  ],
2382
  outputs=[estimated_aws_costs_number],
2383
  )
2384
- only_extract_text_radio.change(
2385
  calculate_aws_costs,
2386
  inputs=[
2387
  total_pdf_page_count,
@@ -2393,7 +2467,7 @@ with app:
2393
  ],
2394
  outputs=[estimated_aws_costs_number],
2395
  )
2396
- textract_output_found_checkbox.change(
2397
  calculate_aws_costs,
2398
  inputs=[
2399
  total_pdf_page_count,
@@ -2407,7 +2481,7 @@ with app:
2407
  )
2408
 
2409
  # Calculate time taken
2410
- total_pdf_page_count.change(
2411
  calculate_time_taken,
2412
  inputs=[
2413
  total_pdf_page_count,
@@ -2419,7 +2493,7 @@ with app:
2419
  ],
2420
  outputs=[estimated_time_taken_number],
2421
  )
2422
- text_extract_method_radio.change(
2423
  calculate_time_taken,
2424
  inputs=[
2425
  total_pdf_page_count,
@@ -2431,7 +2505,7 @@ with app:
2431
  ],
2432
  outputs=[estimated_time_taken_number],
2433
  )
2434
- pii_identification_method_drop.change(
2435
  calculate_time_taken,
2436
  inputs=[
2437
  total_pdf_page_count,
@@ -2443,7 +2517,7 @@ with app:
2443
  ],
2444
  outputs=[estimated_time_taken_number],
2445
  )
2446
- handwrite_signature_checkbox.change(
2447
  calculate_time_taken,
2448
  inputs=[
2449
  total_pdf_page_count,
@@ -2455,7 +2529,7 @@ with app:
2455
  ],
2456
  outputs=[estimated_time_taken_number],
2457
  )
2458
- textract_output_found_checkbox.change(
2459
  calculate_time_taken,
2460
  inputs=[
2461
  total_pdf_page_count,
@@ -2468,7 +2542,7 @@ with app:
2468
  ],
2469
  outputs=[estimated_time_taken_number],
2470
  )
2471
- only_extract_text_radio.change(
2472
  calculate_time_taken,
2473
  inputs=[
2474
  total_pdf_page_count,
@@ -2480,7 +2554,7 @@ with app:
2480
  ],
2481
  outputs=[estimated_time_taken_number],
2482
  )
2483
- textract_output_found_checkbox.change(
2484
  calculate_time_taken,
2485
  inputs=[
2486
  total_pdf_page_count,
@@ -2492,7 +2566,7 @@ with app:
2492
  ],
2493
  outputs=[estimated_time_taken_number],
2494
  )
2495
- relevant_ocr_output_with_words_found_checkbox.change(
2496
  calculate_time_taken,
2497
  inputs=[
2498
  total_pdf_page_count,
@@ -5190,6 +5264,7 @@ with app:
5190
  pdf_doc_state,
5191
  images_pdf_state,
5192
  output_folder_textbox,
 
5193
  ],
5194
  outputs=[input_pdf_for_review],
5195
  scroll_to_output=True,
@@ -6423,7 +6498,6 @@ if __name__ == "__main__":
6423
  # Run the CLI main function with direct mode arguments
6424
  main(direct_mode_args=direct_mode_args)
6425
 
6426
-
6427
  # Combine extraction options
6428
  extraction_options = (
6429
  list(direct_mode_args["handwrite_signature_extraction"])
 
93
  SAVE_LOGS_TO_CSV,
94
  SAVE_LOGS_TO_DYNAMODB,
95
  SESSION_OUTPUT_FOLDER,
96
+ SHOW_AWS_EXAMPLES,
97
  SHOW_COSTS,
98
  SHOW_EXAMPLES,
99
  SHOW_LANGUAGE_SELECTION,
 
1009
  ###
1010
  with gr.Tab("Redact PDFs/images"):
1011
 
 
1012
  # Examples for PDF/image redaction
1013
  if SHOW_EXAMPLES == "True":
1014
  gr.Markdown(
1015
  "### Try an example - Click on an example below and then the 'Extract text and redact document' button:"
1016
  )
1017
+
1018
  # Check which example files exist and create examples only for available files
1019
  example_files = [
1020
  "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
1021
+ "example_data/example_complaint_letter.jpg",
1022
  "example_data/graduate-job-example-cover-letter.pdf",
1023
+ "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
1024
  ]
1025
+
1026
+ available_examples = list()
1027
+ example_labels = list()
1028
+
1029
  # Check each example file and add to examples if it exists
1030
  if os.path.exists(example_files[0]):
1031
+ available_examples.append(
1032
+ [
1033
+ [example_files[0]],
1034
+ "Local model - selectable text",
1035
+ "Local",
1036
+ [],
1037
+ CHOSEN_REDACT_ENTITIES,
1038
+ CHOSEN_COMPREHEND_ENTITIES,
1039
+ [example_files[0]],
1040
+ ]
1041
+ )
1042
  example_labels.append("PDF with selectable text redaction")
1043
+
1044
  if os.path.exists(example_files[1]):
1045
+ available_examples.append(
1046
+ [
1047
+ [example_files[1]],
1048
+ "Local OCR model - PDFs without selectable text",
1049
+ "Local",
1050
+ [],
1051
+ CHOSEN_REDACT_ENTITIES,
1052
+ CHOSEN_COMPREHEND_ENTITIES,
1053
+ [example_files[1]],
1054
+ ]
1055
+ )
1056
  example_labels.append("Image redaction with local OCR")
1057
+
1058
  if os.path.exists(example_files[2]):
1059
+ available_examples.append(
1060
+ [
1061
+ [example_files[2]],
1062
+ "Local OCR model - PDFs without selectable text",
1063
+ "Local",
1064
+ [],
1065
+ ["TITLES", "PERSON", "DATE_TIME"],
1066
+ CHOSEN_COMPREHEND_ENTITIES,
1067
+ [example_files[2]],
1068
+ ]
1069
+ )
1070
+ example_labels.append(
1071
+ "PDF redaction with custom entities (Titles, Person, Dates)"
1072
+ )
1073
+
1074
  if os.path.exists(example_files[3]):
1075
+ if SHOW_AWS_EXAMPLES == "True":
1076
+ available_examples.append(
1077
+ [
1078
+ [example_files[3]],
1079
+ "AWS Textract service - all PDF types",
1080
+ "AWS Comprehend",
1081
+ ["Extract handwriting", "Extract signatures"],
1082
+ CHOSEN_REDACT_ENTITIES,
1083
+ CHOSEN_COMPREHEND_ENTITIES,
1084
+ [example_files[3]],
1085
+ ]
1086
+ )
1087
+ example_labels.append(
1088
+ "PDF redaction with AWS services and signature detection"
1089
+ )
1090
+
1091
  # Only create examples if we have available files
1092
  if available_examples:
1093
+
1094
+ def show_info_box_on_click(
1095
+ in_doc_files,
1096
+ text_extract_method_radio,
1097
+ pii_identification_method_drop,
1098
+ handwrite_signature_checkbox,
1099
+ in_redact_entities,
1100
+ in_redact_comprehend_entities,
1101
+ prepared_pdf_state,
1102
+ ):
1103
+ gr.Info(
1104
+ "Example data loaded. Now click on 'Extract text and redact document' below to run the example redaction."
1105
+ )
1106
+
1107
  redaction_examples = gr.Examples(
1108
  examples=available_examples,
1109
  inputs=[
 
1116
  prepared_pdf_state,
1117
  ],
1118
  example_labels=example_labels,
1119
+ fn=show_info_box_on_click,
1120
+ run_on_click=True,
1121
  )
1122
 
1123
  with gr.Accordion("Redact document", open=True):
 
1693
  "Search for duplicate pages/subdocuments in your ocr_output files. By default, this function will search for duplicate text across multiple pages, and then join consecutive matching pages together into matched 'subdocuments'. The results can be reviewed below, false positives removed, and then the verified results applied to a document you have loaded in on the 'Review redactions' tab."
1694
  )
1695
 
1696
+ # Examples for duplicate page detection
1697
+ # ... existing code ...
1698
+
1699
  # Examples for duplicate page detection
1700
  if SHOW_EXAMPLES == "True":
1701
  gr.Markdown(
1702
  "### Try an example - Click on an example below and then the 'Identify duplicate pages/subdocuments' button:"
1703
  )
1704
+
1705
  # Check if duplicate example file exists
1706
+ duplicate_example_file = (
1707
+ "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv"
1708
+ )
1709
+
1710
  if os.path.exists(duplicate_example_file):
1711
+
1712
+ def show_duplicate_info_box_on_click(
1713
+ in_duplicate_pages,
1714
+ duplicate_threshold_input,
1715
+ min_word_count_input,
1716
+ combine_page_text_for_duplicates_bool,
1717
+ ):
1718
+ gr.Info(
1719
+ "Example data loaded. Now click on 'Identify duplicate pages/subdocuments' below to run the example duplicate detection."
1720
+ )
1721
+
1722
  duplicate_examples = gr.Examples(
1723
  examples=[
1724
  [
 
1744
  "Find duplicate pages of text in document OCR outputs",
1745
  "Find duplicate text lines in document OCR outputs",
1746
  ],
1747
+ fn=show_duplicate_info_box_on_click,
1748
+ run_on_click=True,
1749
  )
1750
 
1751
  with gr.Accordion("Step 1: Configure and run analysis", open=True):
 
1868
  ###
1869
  with gr.Tab(label="Word or Excel/csv files"):
1870
  gr.Markdown(
1871
+ """Choose a Word or tabular data file (xlsx or csv) to redact. Note that when redacting complex Word files with e.g. images, some content/formatting will be removed, and it may not attempt to redact headers. You may prefer to convert the doc file to PDF in Word, and then run it through the first tab of this app (Print to PDF in print settings). Alternatively, an xlsx file output is provided when redacting docx files directly to allow for copying and pasting outputs back into the original document if preferred."""
1872
  )
1873
 
1874
  # Examples for Word/Excel/csv redaction and tabular duplicate detection
 
1876
  gr.Markdown(
1877
  "### Try an example - Click on an example below and then the 'Redact text/data files' button for redaction, or the 'Find duplicate cells/rows' button for duplicate detection:"
1878
  )
1879
+
1880
  # Check which tabular example files exist
1881
  tabular_example_files = [
1882
  "example_data/combined_case_notes.csv",
1883
  "example_data/Bold minimalist professional cover letter.docx",
1884
+ "example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv",
1885
  ]
1886
+
1887
+ available_tabular_examples = list()
1888
+ tabular_example_labels = list()
1889
+
1890
  # Check each tabular example file and add to examples if it exists
1891
  if os.path.exists(tabular_example_files[0]):
1892
+ available_tabular_examples.append(
1893
+ [
1894
+ [tabular_example_files[0]],
1895
+ ["Case Note", "Client"],
1896
+ "Local",
1897
+ "replace with 'REDACTED'",
1898
+ [tabular_example_files[0]],
1899
+ ["Case Note"],
1900
+ ]
1901
+ )
1902
+ tabular_example_labels.append(
1903
+ "CSV file redaction with specific columns - remove text"
1904
+ )
1905
+
1906
  if os.path.exists(tabular_example_files[1]):
1907
+ available_tabular_examples.append(
1908
+ [
1909
+ [tabular_example_files[1]],
1910
+ [],
1911
+ "Local",
1912
+ "replace with 'REDACTED'",
1913
+ [],
1914
+ [],
1915
+ ]
1916
+ )
1917
+ tabular_example_labels.append(
1918
+ "Word document redaction - replace with REDACTED"
1919
+ )
1920
+
1921
  if os.path.exists(tabular_example_files[2]):
1922
+ available_tabular_examples.append(
1923
+ [
1924
+ [tabular_example_files[2]],
1925
+ ["text"],
1926
+ "Local",
1927
+ "replace with 'REDACTED'",
1928
+ [tabular_example_files[2]],
1929
+ ["text"],
1930
+ ]
1931
+ )
1932
+ tabular_example_labels.append(
1933
+ "Tabular duplicate detection in CSV files"
1934
+ )
1935
+
1936
  # Only create examples if we have available files
1937
  if available_tabular_examples:
1938
+
1939
+ def show_tabular_info_box_on_click(
1940
+ in_data_files,
1941
+ in_colnames,
1942
+ pii_identification_method_drop_tabular,
1943
+ anon_strategy,
1944
+ in_tabular_duplicate_files,
1945
+ tabular_text_columns,
1946
+ ):
1947
+ gr.Info(
1948
+ "Example data loaded. Now click on 'Redact text/data files' or 'Find duplicate cells/rows' below to run the example."
1949
+ )
1950
+
1951
  tabular_examples = gr.Examples(
1952
  examples=available_tabular_examples,
1953
  inputs=[
 
1959
  tabular_text_columns,
1960
  ],
1961
  example_labels=tabular_example_labels,
1962
+ fn=show_tabular_info_box_on_click,
1963
+ run_on_click=True,
1964
  )
1965
 
1966
  with gr.Accordion("Redact Word or Excel/csv files", open=True):
 
2387
  # Recalculate estimated costs based on changes to inputs
2388
  if SHOW_COSTS == "True":
2389
  # Calculate costs
2390
+ total_pdf_page_count.input(
2391
  calculate_aws_costs,
2392
  inputs=[
2393
  total_pdf_page_count,
 
2399
  ],
2400
  outputs=[estimated_aws_costs_number],
2401
  )
2402
+ text_extract_method_radio.input(
2403
  fn=check_for_relevant_ocr_output_with_words,
2404
  inputs=[
2405
  doc_file_name_no_extension_textbox,
 
2419
  ],
2420
  outputs=[estimated_aws_costs_number],
2421
  )
2422
+ pii_identification_method_drop.input(
2423
  calculate_aws_costs,
2424
  inputs=[
2425
  total_pdf_page_count,
 
2431
  ],
2432
  outputs=[estimated_aws_costs_number],
2433
  )
2434
+ handwrite_signature_checkbox.input(
2435
  calculate_aws_costs,
2436
  inputs=[
2437
  total_pdf_page_count,
 
2443
  ],
2444
  outputs=[estimated_aws_costs_number],
2445
  )
2446
+ textract_output_found_checkbox.input(
2447
  calculate_aws_costs,
2448
  inputs=[
2449
  total_pdf_page_count,
 
2455
  ],
2456
  outputs=[estimated_aws_costs_number],
2457
  )
2458
+ only_extract_text_radio.input(
2459
  calculate_aws_costs,
2460
  inputs=[
2461
  total_pdf_page_count,
 
2467
  ],
2468
  outputs=[estimated_aws_costs_number],
2469
  )
2470
+ textract_output_found_checkbox.input(
2471
  calculate_aws_costs,
2472
  inputs=[
2473
  total_pdf_page_count,
 
2481
  )
2482
 
2483
  # Calculate time taken
2484
+ total_pdf_page_count.input(
2485
  calculate_time_taken,
2486
  inputs=[
2487
  total_pdf_page_count,
 
2493
  ],
2494
  outputs=[estimated_time_taken_number],
2495
  )
2496
+ text_extract_method_radio.input(
2497
  calculate_time_taken,
2498
  inputs=[
2499
  total_pdf_page_count,
 
2505
  ],
2506
  outputs=[estimated_time_taken_number],
2507
  )
2508
+ pii_identification_method_drop.input(
2509
  calculate_time_taken,
2510
  inputs=[
2511
  total_pdf_page_count,
 
2517
  ],
2518
  outputs=[estimated_time_taken_number],
2519
  )
2520
+ handwrite_signature_checkbox.input(
2521
  calculate_time_taken,
2522
  inputs=[
2523
  total_pdf_page_count,
 
2529
  ],
2530
  outputs=[estimated_time_taken_number],
2531
  )
2532
+ textract_output_found_checkbox.input(
2533
  calculate_time_taken,
2534
  inputs=[
2535
  total_pdf_page_count,
 
2542
  ],
2543
  outputs=[estimated_time_taken_number],
2544
  )
2545
+ only_extract_text_radio.input(
2546
  calculate_time_taken,
2547
  inputs=[
2548
  total_pdf_page_count,
 
2554
  ],
2555
  outputs=[estimated_time_taken_number],
2556
  )
2557
+ textract_output_found_checkbox.input(
2558
  calculate_time_taken,
2559
  inputs=[
2560
  total_pdf_page_count,
 
2566
  ],
2567
  outputs=[estimated_time_taken_number],
2568
  )
2569
+ relevant_ocr_output_with_words_found_checkbox.input(
2570
  calculate_time_taken,
2571
  inputs=[
2572
  total_pdf_page_count,
 
5264
  pdf_doc_state,
5265
  images_pdf_state,
5266
  output_folder_textbox,
5267
+ input_folder_textbox,
5268
  ],
5269
  outputs=[input_pdf_for_review],
5270
  scroll_to_output=True,
 
6498
  # Run the CLI main function with direct mode arguments
6499
  main(direct_mode_args=direct_mode_args)
6500
 
 
6501
  # Combine extraction options
6502
  extraction_options = (
6503
  list(direct_mode_args["handwrite_signature_extraction"])
pyproject.toml CHANGED
@@ -36,7 +36,8 @@ dependencies = [
36
  "python-dotenv==1.0.1",
37
  "awslambdaric==3.1.1",
38
  "python-docx==1.2.0",
39
- "polars==1.33.1"
 
40
  #"paddlepaddle==3.2.0", # Optional paddle imports - only if you want to use hybrid OCR mode with tesseract and paddleOCR
41
  #"paddleocr==3.2.0"
42
  ]
@@ -66,4 +67,27 @@ ignore = [
66
  # Configuration for a Black formatter:
67
  [tool.black]
68
  line-length = 88
69
- target-version = ['py310']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  "python-dotenv==1.0.1",
37
  "awslambdaric==3.1.1",
38
  "python-docx==1.2.0",
39
+ "polars==1.33.1",
40
+ "defusedxml==0.7.1",
41
  #"paddlepaddle==3.2.0", # Optional paddle imports - only if you want to use hybrid OCR mode with tesseract and paddleOCR
42
  #"paddleocr==3.2.0"
43
  ]
 
67
  # Configuration for a Black formatter:
68
  [tool.black]
69
  line-length = 88
70
+ target-version = ['py310']
71
+
72
+ # Configuration for pytest:
73
+ [tool.pytest.ini_options]
74
+ filterwarnings = [
75
+ "ignore::DeprecationWarning:click.parser",
76
+ "ignore::DeprecationWarning:weasel.util.config",
77
+ "ignore::DeprecationWarning:builtin type",
78
+ "ignore::DeprecationWarning:websockets.legacy",
79
+ "ignore::DeprecationWarning:websockets.server",
80
+ "ignore::DeprecationWarning:spacy.cli._util",
81
+ "ignore::DeprecationWarning:weasel.util.config",
82
+ "ignore::DeprecationWarning:importlib._bootstrap",
83
+ ]
84
+ testpaths = ["test"]
85
+ python_files = ["test_*.py", "*_test.py"]
86
+ python_classes = ["Test*"]
87
+ python_functions = ["test_*"]
88
+ addopts = [
89
+ "-v",
90
+ "--tb=short",
91
+ "--strict-markers",
92
+ "--disable-warnings",
93
+ ]
requirements.txt CHANGED
@@ -23,6 +23,7 @@ rapidfuzz==3.14.1
23
  python-dotenv==1.0.1
24
  awslambdaric==3.1.1
25
  python-docx==1.2.0
 
26
  # Optional: uncomment the below to install paddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
27
  # paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
28
  # paddleocr==3.2.0
 
23
  python-dotenv==1.0.1
24
  awslambdaric==3.1.1
25
  python-docx==1.2.0
26
+ defusedxml==0.7.1
27
  # Optional: uncomment the below to install paddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
28
  # paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
29
  # paddleocr==3.2.0
tools/config.py CHANGED
@@ -544,6 +544,7 @@ except Exception as e:
544
  COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
545
 
546
  SHOW_EXAMPLES = get_or_create_env_var("SHOW_EXAMPLES", "False")
 
547
 
548
  RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")
549
 
 
544
  COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
545
 
546
  SHOW_EXAMPLES = get_or_create_env_var("SHOW_EXAMPLES", "False")
547
+ SHOW_AWS_EXAMPLES = get_or_create_env_var("SHOW_AWS_EXAMPLES", "False")
548
 
549
  RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")
550
 
tools/data_anonymise.py CHANGED
@@ -588,6 +588,9 @@ def anonymise_files_with_open_text(
588
  # Set to a very high number so as not to mess with subsequent file processing by the user
589
  # latest_file_completed = 99
590
  final_out_message = "\n".join(out_message)
 
 
 
591
  return (
592
  final_out_message,
593
  out_file_paths,
 
588
  # Set to a very high number so as not to mess with subsequent file processing by the user
589
  # latest_file_completed = 99
590
  final_out_message = "\n".join(out_message)
591
+
592
+ gr.Info(final_out_message)
593
+
594
  return (
595
  final_out_message,
596
  out_file_paths,
tools/file_redaction.py CHANGED
@@ -470,6 +470,8 @@ def choose_and_run_redactor(
470
  )
471
  print("Estimated total processing time:", str(estimate_total_processing_time))
472
 
 
 
473
  page_break_return = True
474
 
475
  return (
 
470
  )
471
  print("Estimated total processing time:", str(estimate_total_processing_time))
472
 
473
+ gr.Info(combined_out_message)
474
+
475
  page_break_return = True
476
 
477
  return (
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -277,7 +277,7 @@ def download_tesseract_lang_pack(
277
 
278
  # Download the file
279
  try:
280
- response = requests.get(url, stream=True)
281
  response.raise_for_status() # Raise an exception for bad status codes
282
 
283
  with open(file_path, "wb") as f:
 
277
 
278
  # Download the file
279
  try:
280
+ response = requests.get(url, stream=True, timeout=60)
281
  response.raise_for_status() # Raise an exception for bad status codes
282
 
283
  with open(file_path, "wb") as f:
tools/redaction_review.py CHANGED
@@ -4,8 +4,14 @@ import string
4
  import uuid
5
  from datetime import datetime, timedelta, timezone
6
  from typing import Dict, List, Set, Tuple
7
- from xml.dom import minidom
8
- from xml.etree.ElementTree import Element, SubElement, parse, tostring
 
 
 
 
 
 
9
 
10
  import gradio as gr
11
  import numpy as np
@@ -617,6 +623,10 @@ def update_annotator_page_from_review_df(
617
  print(
618
  f"Error during image path replacement for page {gradio_annotator_current_page_number}: {e}"
619
  )
 
 
 
 
620
 
621
  # Save back page_sizes_df to page_sizes list format
622
  if not page_sizes_df.empty:
@@ -2652,7 +2662,7 @@ def create_xfdf(
2652
  data_element.text = data_content_string
2653
 
2654
  rough_string = tostring(xfdf_root, encoding="unicode", method="xml")
2655
- reparsed = minidom.parseString(rough_string)
2656
  return reparsed.toxml() # .toprettyxml(indent=" ")
2657
 
2658
 
@@ -2793,7 +2803,9 @@ def parse_xfdf(xfdf_path: str):
2793
  Returns:
2794
  - List of dictionaries containing redaction information
2795
  """
2796
- tree = parse(xfdf_path)
 
 
2797
  root = tree.getroot()
2798
 
2799
  # Define the namespace
@@ -2804,6 +2816,25 @@ def parse_xfdf(xfdf_path: str):
2804
  # Find all redact elements using the namespace
2805
  for redact in root.findall(".//xfdf:redact", namespaces=namespace):
2806
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2807
  redaction_info = {
2808
  "image": "", # Image will be filled in later
2809
  "page": int(redact.get("page")) + 1, # Convert to 1-based index
@@ -2812,7 +2843,7 @@ def parse_xfdf(xfdf_path: str):
2812
  "xmax": float(redact.get("rect").split(",")[2]),
2813
  "ymax": float(redact.get("rect").split(",")[3]),
2814
  "label": redact.get("title"),
2815
- "text": redact.get("contents"),
2816
  "color": redact.get(
2817
  "border-color", "(0, 0, 0)"
2818
  ), # Default to black if not specified
@@ -2824,9 +2855,10 @@ def parse_xfdf(xfdf_path: str):
2824
 
2825
  def convert_xfdf_to_dataframe(
2826
  file_paths_list: List[str],
2827
- pymupdf_doc,
2828
  image_paths: List[str],
2829
  output_folder: str = OUTPUT_FOLDER,
 
2830
  ):
2831
  """
2832
  Convert redaction annotations from XFDF and associated images into a DataFrame.
@@ -2835,12 +2867,16 @@ def convert_xfdf_to_dataframe(
2835
  - xfdf_path: Path to the XFDF file
2836
  - pdf_doc: PyMuPDF document object
2837
  - image_paths: List of PIL Image objects corresponding to PDF pages
 
 
2838
 
2839
  Returns:
2840
  - DataFrame containing redaction information
2841
  """
2842
  output_paths = list()
2843
  df = pd.DataFrame()
 
 
2844
 
2845
  # Sort the file paths so that the pdfs come first
2846
  file_paths_list = sorted(
@@ -2863,6 +2899,7 @@ def convert_xfdf_to_dataframe(
2863
 
2864
  if file_path_end == "pdf":
2865
  pdf_name = os.path.basename(file_path)
 
2866
 
2867
  # Add pdf to outputs
2868
  output_paths.append(file_path)
@@ -2896,7 +2933,18 @@ def convert_xfdf_to_dataframe(
2896
  image_path = image_paths[page_python_format]
2897
 
2898
  if isinstance(image_path, str):
2899
- image = Image.open(image_path)
 
 
 
 
 
 
 
 
 
 
 
2900
 
2901
  image_page_width, image_page_height = image.size
2902
 
@@ -2927,4 +2975,8 @@ def convert_xfdf_to_dataframe(
2927
 
2928
  output_paths.append(out_file_path)
2929
 
 
 
 
 
2930
  return output_paths
 
4
  import uuid
5
  from datetime import datetime, timedelta, timezone
6
  from typing import Dict, List, Set, Tuple
7
+ from xml.etree.ElementTree import Element, SubElement, tostring
8
+
9
+ import defusedxml
10
+ import defusedxml.ElementTree as defused_etree
11
+ import defusedxml.minidom as defused_minidom
12
+
13
+ # Defuse the standard library XML modules for security
14
+ defusedxml.defuse_stdlib()
15
 
16
  import gradio as gr
17
  import numpy as np
 
623
  print(
624
  f"Error during image path replacement for page {gradio_annotator_current_page_number}: {e}"
625
  )
626
+ else:
627
+ print(
628
+ f"Warning: Page index {page_num_reported_zero_indexed} out of bounds for all_image_annotations list."
629
+ )
630
 
631
  # Save back page_sizes_df to page_sizes list format
632
  if not page_sizes_df.empty:
 
2662
  data_element.text = data_content_string
2663
 
2664
  rough_string = tostring(xfdf_root, encoding="unicode", method="xml")
2665
+ reparsed = defused_minidom.parseString(rough_string)
2666
  return reparsed.toxml() # .toprettyxml(indent=" ")
2667
 
2668
 
 
2803
  Returns:
2804
  - List of dictionaries containing redaction information
2805
  """
2806
+ # Assuming xfdf_path is a file path. If you are passing the XML string,
2807
+ # you would use defused_etree.fromstring(xfdf_string) instead of .parse()
2808
+ tree = defused_etree.parse(xfdf_path)
2809
  root = tree.getroot()
2810
 
2811
  # Define the namespace
 
2816
  # Find all redact elements using the namespace
2817
  for redact in root.findall(".//xfdf:redact", namespaces=namespace):
2818
 
2819
+ # Extract text from contents-richtext if it exists
2820
+ text_content = ""
2821
+
2822
+ # *** THE FIX IS HERE ***
2823
+ # Use the namespace to find the contents-richtext element
2824
+ contents_richtext = redact.find(
2825
+ ".//xfdf:contents-richtext", namespaces=namespace
2826
+ )
2827
+
2828
+ if contents_richtext is not None:
2829
+ # Get all text content from the HTML structure
2830
+ # The children of contents-richtext (body, p, span) have a different namespace
2831
+ # but itertext() cleverly handles that for us.
2832
+ text_content = "".join(contents_richtext.itertext()).strip()
2833
+
2834
+ # Fallback to contents attribute if no richtext content
2835
+ if not text_content:
2836
+ text_content = redact.get("contents", "")
2837
+
2838
  redaction_info = {
2839
  "image": "", # Image will be filled in later
2840
  "page": int(redact.get("page")) + 1, # Convert to 1-based index
 
2843
  "xmax": float(redact.get("rect").split(",")[2]),
2844
  "ymax": float(redact.get("rect").split(",")[3]),
2845
  "label": redact.get("title"),
2846
+ "text": text_content, # Use the extracted text content
2847
  "color": redact.get(
2848
  "border-color", "(0, 0, 0)"
2849
  ), # Default to black if not specified
 
2855
 
2856
  def convert_xfdf_to_dataframe(
2857
  file_paths_list: List[str],
2858
+ pymupdf_doc: Document,
2859
  image_paths: List[str],
2860
  output_folder: str = OUTPUT_FOLDER,
2861
+ input_folder: str = INPUT_FOLDER,
2862
  ):
2863
  """
2864
  Convert redaction annotations from XFDF and associated images into a DataFrame.
 
2867
  - xfdf_path: Path to the XFDF file
2868
  - pdf_doc: PyMuPDF document object
2869
  - image_paths: List of PIL Image objects corresponding to PDF pages
2870
+ - output_folder: Output folder for file save
2871
+ - input_folder: Input folder for image creation
2872
 
2873
  Returns:
2874
  - DataFrame containing redaction information
2875
  """
2876
  output_paths = list()
2877
  df = pd.DataFrame()
2878
+ pdf_name = ""
2879
+ pdf_path = ""
2880
 
2881
  # Sort the file paths so that the pdfs come first
2882
  file_paths_list = sorted(
 
2899
 
2900
  if file_path_end == "pdf":
2901
  pdf_name = os.path.basename(file_path)
2902
+ pdf_path = file_path
2903
 
2904
  # Add pdf to outputs
2905
  output_paths.append(file_path)
 
2933
  image_path = image_paths[page_python_format]
2934
 
2935
  if isinstance(image_path, str):
2936
+ try:
2937
+ image = Image.open(image_path)
2938
+ except Exception:
2939
+ # print(f"Error opening image: {e}")
2940
+
2941
+ page_num, out_path, width, height = (
2942
+ process_single_page_for_image_conversion(
2943
+ pdf_path, page_python_format, input_folder=input_folder
2944
+ )
2945
+ )
2946
+
2947
+ image = Image.open(out_path)
2948
 
2949
  image_page_width, image_page_height = image.size
2950
 
 
2975
 
2976
  output_paths.append(out_file_path)
2977
 
2978
+ gr.Info(
2979
+ f"Review file saved to {out_file_path}. Now click on '1. Upload original pdf' to view the pdf with the annotations."
2980
+ )
2981
+
2982
  return output_paths
tools/secure_regex_utils.py CHANGED
@@ -86,7 +86,7 @@ def safe_extract_page_number_from_filename(filename: str) -> Optional[int]:
86
 
87
  def safe_extract_page_number_from_path(path: str) -> Optional[int]:
88
  """
89
- Safely extract page number from path containing _(\d+).png pattern.
90
 
91
  Args:
92
  path: The path to extract page number from
 
86
 
87
  def safe_extract_page_number_from_path(path: str) -> Optional[int]:
88
  """
89
+ Safely extract page number from path containing _(\\d+).png pattern.
90
 
91
  Args:
92
  path: The path to extract page number from