seanpedrickcase committed on
Commit
1cb1897
·
1 Parent(s): 96ac47b

Removed some extraneous test steps. Improved Example loading and feedback, and redaction feedback. Minor security updates. Fixed Adobe xfdf file parsing.

Browse files
.coveragerc ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [run]
2
+ source = .
3
+ omit =
4
+ */tests/*
5
+ */test/*
6
+ */__pycache__/*
7
+ */venv/*
8
+ */env/*
9
+ */build/*
10
+ */dist/*
11
+ */cdk/*
12
+ */docs/*
13
+ */example_data/*
14
+ */examples/*
15
+ */feedback/*
16
+ */logs/*
17
+ */old_code/*
18
+ */output/*
19
+ */tmp/*
20
+ */usage/*
21
+ */tld/*
22
+ */tesseract/*
23
+ */poppler/*
24
+ config*.py
25
+ setup.py
26
+ lambda_entrypoint.py
27
+ entrypoint.sh
28
+ cli_redact.py
29
+ load_dynamo_logs.py
30
+ load_s3_logs.py
31
+ *.spec
32
+ Dockerfile
33
+ *.qmd
34
+ *.md
35
+ *.txt
36
+ *.yml
37
+ *.yaml
38
+ *.json
39
+ *.csv
40
+ *.env
41
+ *.bat
42
+ *.ps1
43
+ *.sh
44
+
45
+ [report]
46
+ exclude_lines =
47
+ pragma: no cover
48
+ def __repr__
49
+ if self.debug:
50
+ if settings.DEBUG
51
+ raise AssertionError
52
+ raise NotImplementedError
53
+ if 0:
54
+ if __name__ == .__main__.:
55
+ class .*\bProtocol\):
56
+ @(abc\.)?abstractmethod
.github/workflows/ci.yml CHANGED
@@ -90,6 +90,10 @@ jobs:
90
  run: |
91
  python .github/scripts/setup_test_data.py
92
 
 
 
 
 
93
  - name: Run CLI tests
94
  run: |
95
  cd test
@@ -101,16 +105,16 @@ jobs:
101
 
102
  - name: Run tests with coverage
103
  run: |
104
- pytest test/test.py --cov=. --cov-report=xml --cov-report=html --cov-report=term
105
 
106
- - name: Upload coverage to Codecov
107
- uses: codecov/codecov-action@v3
108
- if: matrix.python-version == '3.11'
109
- with:
110
- file: ./coverage.xml
111
- flags: unittests
112
- name: codecov-umbrella
113
- fail_ci_if_error: false
114
 
115
  - name: Upload test results
116
  uses: actions/upload-artifact@v4
 
90
  run: |
91
  python .github/scripts/setup_test_data.py
92
 
93
+ - name: Clean up problematic config files
94
+ run: |
95
+ rm -f config*.py || true
96
+
97
  - name: Run CLI tests
98
  run: |
99
  cd test
 
105
 
106
  - name: Run tests with coverage
107
  run: |
108
+ pytest test/test.py --cov=. --cov-config=.coveragerc --cov-report=xml --cov-report=html --cov-report=term
109
 
110
+ #- name: Upload coverage to Codecov - not necessary
111
+ # uses: codecov/codecov-action@v3
112
+ # if: matrix.python-version == '3.11'
113
+ # with:
114
+ # file: ./coverage.xml
115
+ # flags: unittests
116
+ # name: codecov-umbrella
117
+ # fail_ci_if_error: false
118
 
119
  - name: Upload test results
120
  uses: actions/upload-artifact@v4
app.py CHANGED
@@ -93,6 +93,7 @@ from tools.config import (
93
  SAVE_LOGS_TO_CSV,
94
  SAVE_LOGS_TO_DYNAMODB,
95
  SESSION_OUTPUT_FOLDER,
 
96
  SHOW_COSTS,
97
  SHOW_EXAMPLES,
98
  SHOW_LANGUAGE_SELECTION,
@@ -1008,75 +1009,101 @@ with app:
1008
  ###
1009
  with gr.Tab("Redact PDFs/images"):
1010
 
1011
- # Examples for PDF/image redaction
1012
  # Examples for PDF/image redaction
1013
  if SHOW_EXAMPLES == "True":
1014
  gr.Markdown(
1015
  "### Try an example - Click on an example below and then the 'Extract text and redact document' button:"
1016
  )
1017
-
1018
  # Check which example files exist and create examples only for available files
1019
  example_files = [
1020
  "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
1021
- "example_data/example_complaint_letter.jpg",
1022
  "example_data/graduate-job-example-cover-letter.pdf",
1023
- "example_data/Partnership-Agreement-Toolkit_0_0.pdf"
1024
  ]
1025
-
1026
- available_examples = []
1027
- example_labels = []
1028
-
1029
  # Check each example file and add to examples if it exists
1030
  if os.path.exists(example_files[0]):
1031
- available_examples.append([
1032
- [example_files[0]],
1033
- "Local model - selectable text",
1034
- "Local",
1035
- [],
1036
- CHOSEN_REDACT_ENTITIES,
1037
- CHOSEN_COMPREHEND_ENTITIES,
1038
- [example_files[0]],
1039
- ])
 
 
1040
  example_labels.append("PDF with selectable text redaction")
1041
-
1042
  if os.path.exists(example_files[1]):
1043
- available_examples.append([
1044
- [example_files[1]],
1045
- "Local OCR model - PDFs without selectable text",
1046
- "Local",
1047
- [],
1048
- CHOSEN_REDACT_ENTITIES,
1049
- CHOSEN_COMPREHEND_ENTITIES,
1050
- [example_files[1]],
1051
- ])
 
 
1052
  example_labels.append("Image redaction with local OCR")
1053
-
1054
  if os.path.exists(example_files[2]):
1055
- available_examples.append([
1056
- [example_files[2]],
1057
- "Local OCR model - PDFs without selectable text",
1058
- "Local",
1059
- [],
1060
- ["TITLES", "PERSON", "DATE_TIME"],
1061
- CHOSEN_COMPREHEND_ENTITIES,
1062
- [example_files[2]],
1063
- ])
1064
- example_labels.append("PDF redaction with custom entities (TITLES, PERSON, DATE_TIME)")
1065
-
 
 
 
 
1066
  if os.path.exists(example_files[3]):
1067
- available_examples.append([
1068
- [example_files[3]],
1069
- "AWS Textract service - all PDF types",
1070
- "AWS Comprehend",
1071
- ["Extract handwriting", "Extract signatures"],
1072
- CHOSEN_REDACT_ENTITIES,
1073
- CHOSEN_COMPREHEND_ENTITIES,
1074
- [example_files[3]],
1075
- ])
1076
- example_labels.append("PDF redaction with AWS services and signature detection")
1077
-
 
 
 
 
 
1078
  # Only create examples if we have available files
1079
  if available_examples:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1080
  redaction_examples = gr.Examples(
1081
  examples=available_examples,
1082
  inputs=[
@@ -1089,6 +1116,8 @@ with app:
1089
  prepared_pdf_state,
1090
  ],
1091
  example_labels=example_labels,
 
 
1092
  )
1093
 
1094
  with gr.Accordion("Redact document", open=True):
@@ -1664,16 +1693,32 @@ with app:
1664
  "Search for duplicate pages/subdocuments in your ocr_output files. By default, this function will search for duplicate text across multiple pages, and then join consecutive matching pages together into matched 'subdocuments'. The results can be reviewed below, false positives removed, and then the verified results applied to a document you have loaded in on the 'Review redactions' tab."
1665
  )
1666
 
 
 
 
1667
  # Examples for duplicate page detection
1668
  if SHOW_EXAMPLES == "True":
1669
  gr.Markdown(
1670
  "### Try an example - Click on an example below and then the 'Identify duplicate pages/subdocuments' button:"
1671
  )
1672
-
1673
  # Check if duplicate example file exists
1674
- duplicate_example_file = "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv"
1675
-
 
 
1676
  if os.path.exists(duplicate_example_file):
 
 
 
 
 
 
 
 
 
 
 
1677
  duplicate_examples = gr.Examples(
1678
  examples=[
1679
  [
@@ -1699,6 +1744,8 @@ with app:
1699
  "Find duplicate pages of text in document OCR outputs",
1700
  "Find duplicate text lines in document OCR outputs",
1701
  ],
 
 
1702
  )
1703
 
1704
  with gr.Accordion("Step 1: Configure and run analysis", open=True):
@@ -1821,7 +1868,7 @@ with app:
1821
  ###
1822
  with gr.Tab(label="Word or Excel/csv files"):
1823
  gr.Markdown(
1824
- """Choose Word or a tabular data file (xlsx or csv) to redact. Note that when redacting complex Word files with e.g. images, some content/formatting will be removed, and it may not attempt to redact headers. You may prefer to convert the doc file to PDF in Word, and then run it through the first tab of this app (Print to PDF in print settings). Alternatively, an xlsx file output is provided when redacting docx files directly to allow for copying and pasting outputs back into the original document if preferred."""
1825
  )
1826
 
1827
  # Examples for Word/Excel/csv redaction and tabular duplicate detection
@@ -1829,53 +1876,78 @@ with app:
1829
  gr.Markdown(
1830
  "### Try an example - Click on an example below and then the 'Redact text/data files' button for redaction, or the 'Find duplicate cells/rows' button for duplicate detection:"
1831
  )
1832
-
1833
  # Check which tabular example files exist
1834
  tabular_example_files = [
1835
  "example_data/combined_case_notes.csv",
1836
  "example_data/Bold minimalist professional cover letter.docx",
1837
- "example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
1838
  ]
1839
-
1840
- available_tabular_examples = []
1841
- tabular_example_labels = []
1842
-
1843
  # Check each tabular example file and add to examples if it exists
1844
  if os.path.exists(tabular_example_files[0]):
1845
- available_tabular_examples.append([
1846
- [tabular_example_files[0]],
1847
- ["Case Note", "Client"],
1848
- "Local",
1849
- "replace with 'REDACTED'",
1850
- [tabular_example_files[0]],
1851
- ["Case Note"],
1852
- ])
1853
- tabular_example_labels.append("CSV file redaction with specific columns - remove text")
1854
-
 
 
 
 
1855
  if os.path.exists(tabular_example_files[1]):
1856
- available_tabular_examples.append([
1857
- [tabular_example_files[1]],
1858
- [],
1859
- "Local",
1860
- "replace with 'REDACTED'",
1861
- [],
1862
- [],
1863
- ])
1864
- tabular_example_labels.append("Word document redaction - replace with REDACTED")
1865
-
 
 
 
 
1866
  if os.path.exists(tabular_example_files[2]):
1867
- available_tabular_examples.append([
1868
- [tabular_example_files[2]],
1869
- ["text"],
1870
- "Local",
1871
- "replace with 'REDACTED'",
1872
- [tabular_example_files[2]],
1873
- ["text"],
1874
- ])
1875
- tabular_example_labels.append("Tabular duplicate detection in CSV files")
1876
-
 
 
 
 
1877
  # Only create examples if we have available files
1878
  if available_tabular_examples:
 
 
 
 
 
 
 
 
 
 
 
 
 
1879
  tabular_examples = gr.Examples(
1880
  examples=available_tabular_examples,
1881
  inputs=[
@@ -1887,6 +1959,8 @@ with app:
1887
  tabular_text_columns,
1888
  ],
1889
  example_labels=tabular_example_labels,
 
 
1890
  )
1891
 
1892
  with gr.Accordion("Redact Word or Excel/csv files", open=True):
@@ -2313,7 +2387,7 @@ with app:
2313
  # Recalculate estimated costs based on changes to inputs
2314
  if SHOW_COSTS == "True":
2315
  # Calculate costs
2316
- total_pdf_page_count.change(
2317
  calculate_aws_costs,
2318
  inputs=[
2319
  total_pdf_page_count,
@@ -2325,7 +2399,7 @@ with app:
2325
  ],
2326
  outputs=[estimated_aws_costs_number],
2327
  )
2328
- text_extract_method_radio.change(
2329
  fn=check_for_relevant_ocr_output_with_words,
2330
  inputs=[
2331
  doc_file_name_no_extension_textbox,
@@ -2345,7 +2419,7 @@ with app:
2345
  ],
2346
  outputs=[estimated_aws_costs_number],
2347
  )
2348
- pii_identification_method_drop.change(
2349
  calculate_aws_costs,
2350
  inputs=[
2351
  total_pdf_page_count,
@@ -2357,7 +2431,7 @@ with app:
2357
  ],
2358
  outputs=[estimated_aws_costs_number],
2359
  )
2360
- handwrite_signature_checkbox.change(
2361
  calculate_aws_costs,
2362
  inputs=[
2363
  total_pdf_page_count,
@@ -2369,7 +2443,7 @@ with app:
2369
  ],
2370
  outputs=[estimated_aws_costs_number],
2371
  )
2372
- textract_output_found_checkbox.change(
2373
  calculate_aws_costs,
2374
  inputs=[
2375
  total_pdf_page_count,
@@ -2381,7 +2455,7 @@ with app:
2381
  ],
2382
  outputs=[estimated_aws_costs_number],
2383
  )
2384
- only_extract_text_radio.change(
2385
  calculate_aws_costs,
2386
  inputs=[
2387
  total_pdf_page_count,
@@ -2393,7 +2467,7 @@ with app:
2393
  ],
2394
  outputs=[estimated_aws_costs_number],
2395
  )
2396
- textract_output_found_checkbox.change(
2397
  calculate_aws_costs,
2398
  inputs=[
2399
  total_pdf_page_count,
@@ -2407,7 +2481,7 @@ with app:
2407
  )
2408
 
2409
  # Calculate time taken
2410
- total_pdf_page_count.change(
2411
  calculate_time_taken,
2412
  inputs=[
2413
  total_pdf_page_count,
@@ -2419,7 +2493,7 @@ with app:
2419
  ],
2420
  outputs=[estimated_time_taken_number],
2421
  )
2422
- text_extract_method_radio.change(
2423
  calculate_time_taken,
2424
  inputs=[
2425
  total_pdf_page_count,
@@ -2431,7 +2505,7 @@ with app:
2431
  ],
2432
  outputs=[estimated_time_taken_number],
2433
  )
2434
- pii_identification_method_drop.change(
2435
  calculate_time_taken,
2436
  inputs=[
2437
  total_pdf_page_count,
@@ -2443,7 +2517,7 @@ with app:
2443
  ],
2444
  outputs=[estimated_time_taken_number],
2445
  )
2446
- handwrite_signature_checkbox.change(
2447
  calculate_time_taken,
2448
  inputs=[
2449
  total_pdf_page_count,
@@ -2455,7 +2529,7 @@ with app:
2455
  ],
2456
  outputs=[estimated_time_taken_number],
2457
  )
2458
- textract_output_found_checkbox.change(
2459
  calculate_time_taken,
2460
  inputs=[
2461
  total_pdf_page_count,
@@ -2468,7 +2542,7 @@ with app:
2468
  ],
2469
  outputs=[estimated_time_taken_number],
2470
  )
2471
- only_extract_text_radio.change(
2472
  calculate_time_taken,
2473
  inputs=[
2474
  total_pdf_page_count,
@@ -2480,7 +2554,7 @@ with app:
2480
  ],
2481
  outputs=[estimated_time_taken_number],
2482
  )
2483
- textract_output_found_checkbox.change(
2484
  calculate_time_taken,
2485
  inputs=[
2486
  total_pdf_page_count,
@@ -2492,7 +2566,7 @@ with app:
2492
  ],
2493
  outputs=[estimated_time_taken_number],
2494
  )
2495
- relevant_ocr_output_with_words_found_checkbox.change(
2496
  calculate_time_taken,
2497
  inputs=[
2498
  total_pdf_page_count,
@@ -5190,6 +5264,7 @@ with app:
5190
  pdf_doc_state,
5191
  images_pdf_state,
5192
  output_folder_textbox,
 
5193
  ],
5194
  outputs=[input_pdf_for_review],
5195
  scroll_to_output=True,
@@ -6423,7 +6498,6 @@ if __name__ == "__main__":
6423
  # Run the CLI main function with direct mode arguments
6424
  main(direct_mode_args=direct_mode_args)
6425
 
6426
-
6427
  # Combine extraction options
6428
  extraction_options = (
6429
  list(direct_mode_args["handwrite_signature_extraction"])
 
93
  SAVE_LOGS_TO_CSV,
94
  SAVE_LOGS_TO_DYNAMODB,
95
  SESSION_OUTPUT_FOLDER,
96
+ SHOW_AWS_EXAMPLES,
97
  SHOW_COSTS,
98
  SHOW_EXAMPLES,
99
  SHOW_LANGUAGE_SELECTION,
 
1009
  ###
1010
  with gr.Tab("Redact PDFs/images"):
1011
 
 
1012
  # Examples for PDF/image redaction
1013
  if SHOW_EXAMPLES == "True":
1014
  gr.Markdown(
1015
  "### Try an example - Click on an example below and then the 'Extract text and redact document' button:"
1016
  )
1017
+
1018
  # Check which example files exist and create examples only for available files
1019
  example_files = [
1020
  "example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
1021
+ "example_data/example_complaint_letter.jpg",
1022
  "example_data/graduate-job-example-cover-letter.pdf",
1023
+ "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
1024
  ]
1025
+
1026
+ available_examples = list()
1027
+ example_labels = list()
1028
+
1029
  # Check each example file and add to examples if it exists
1030
  if os.path.exists(example_files[0]):
1031
+ available_examples.append(
1032
+ [
1033
+ [example_files[0]],
1034
+ "Local model - selectable text",
1035
+ "Local",
1036
+ [],
1037
+ CHOSEN_REDACT_ENTITIES,
1038
+ CHOSEN_COMPREHEND_ENTITIES,
1039
+ [example_files[0]],
1040
+ ]
1041
+ )
1042
  example_labels.append("PDF with selectable text redaction")
1043
+
1044
  if os.path.exists(example_files[1]):
1045
+ available_examples.append(
1046
+ [
1047
+ [example_files[1]],
1048
+ "Local OCR model - PDFs without selectable text",
1049
+ "Local",
1050
+ [],
1051
+ CHOSEN_REDACT_ENTITIES,
1052
+ CHOSEN_COMPREHEND_ENTITIES,
1053
+ [example_files[1]],
1054
+ ]
1055
+ )
1056
  example_labels.append("Image redaction with local OCR")
1057
+
1058
  if os.path.exists(example_files[2]):
1059
+ available_examples.append(
1060
+ [
1061
+ [example_files[2]],
1062
+ "Local OCR model - PDFs without selectable text",
1063
+ "Local",
1064
+ [],
1065
+ ["TITLES", "PERSON", "DATE_TIME"],
1066
+ CHOSEN_COMPREHEND_ENTITIES,
1067
+ [example_files[2]],
1068
+ ]
1069
+ )
1070
+ example_labels.append(
1071
+ "PDF redaction with custom entities (Titles, Person, Dates)"
1072
+ )
1073
+
1074
  if os.path.exists(example_files[3]):
1075
+ if SHOW_AWS_EXAMPLES == "True":
1076
+ available_examples.append(
1077
+ [
1078
+ [example_files[3]],
1079
+ "AWS Textract service - all PDF types",
1080
+ "AWS Comprehend",
1081
+ ["Extract handwriting", "Extract signatures"],
1082
+ CHOSEN_REDACT_ENTITIES,
1083
+ CHOSEN_COMPREHEND_ENTITIES,
1084
+ [example_files[3]],
1085
+ ]
1086
+ )
1087
+ example_labels.append(
1088
+ "PDF redaction with AWS services and signature detection"
1089
+ )
1090
+
1091
  # Only create examples if we have available files
1092
  if available_examples:
1093
+
1094
+ def show_info_box_on_click(
1095
+ in_doc_files,
1096
+ text_extract_method_radio,
1097
+ pii_identification_method_drop,
1098
+ handwrite_signature_checkbox,
1099
+ in_redact_entities,
1100
+ in_redact_comprehend_entities,
1101
+ prepared_pdf_state,
1102
+ ):
1103
+ gr.Info(
1104
+ "Example data loaded. Now click on 'Extract text and redact document' below to run the example redaction."
1105
+ )
1106
+
1107
  redaction_examples = gr.Examples(
1108
  examples=available_examples,
1109
  inputs=[
 
1116
  prepared_pdf_state,
1117
  ],
1118
  example_labels=example_labels,
1119
+ fn=show_info_box_on_click,
1120
+ run_on_click=True,
1121
  )
1122
 
1123
  with gr.Accordion("Redact document", open=True):
 
1693
  "Search for duplicate pages/subdocuments in your ocr_output files. By default, this function will search for duplicate text across multiple pages, and then join consecutive matching pages together into matched 'subdocuments'. The results can be reviewed below, false positives removed, and then the verified results applied to a document you have loaded in on the 'Review redactions' tab."
1694
  )
1695
 
1696
+ # Examples for duplicate page detection
1697
+ # ... existing code ...
1698
+
1699
  # Examples for duplicate page detection
1700
  if SHOW_EXAMPLES == "True":
1701
  gr.Markdown(
1702
  "### Try an example - Click on an example below and then the 'Identify duplicate pages/subdocuments' button:"
1703
  )
1704
+
1705
  # Check if duplicate example file exists
1706
+ duplicate_example_file = (
1707
+ "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv"
1708
+ )
1709
+
1710
  if os.path.exists(duplicate_example_file):
1711
+
1712
+ def show_duplicate_info_box_on_click(
1713
+ in_duplicate_pages,
1714
+ duplicate_threshold_input,
1715
+ min_word_count_input,
1716
+ combine_page_text_for_duplicates_bool,
1717
+ ):
1718
+ gr.Info(
1719
+ "Example data loaded. Now click on 'Identify duplicate pages/subdocuments' below to run the example duplicate detection."
1720
+ )
1721
+
1722
  duplicate_examples = gr.Examples(
1723
  examples=[
1724
  [
 
1744
  "Find duplicate pages of text in document OCR outputs",
1745
  "Find duplicate text lines in document OCR outputs",
1746
  ],
1747
+ fn=show_duplicate_info_box_on_click,
1748
+ run_on_click=True,
1749
  )
1750
 
1751
  with gr.Accordion("Step 1: Configure and run analysis", open=True):
 
1868
  ###
1869
  with gr.Tab(label="Word or Excel/csv files"):
1870
  gr.Markdown(
1871
+ """Choose a Word or tabular data file (xlsx or csv) to redact. Note that when redacting complex Word files with e.g. images, some content/formatting will be removed, and it may not attempt to redact headers. You may prefer to convert the doc file to PDF in Word, and then run it through the first tab of this app (Print to PDF in print settings). Alternatively, an xlsx file output is provided when redacting docx files directly to allow for copying and pasting outputs back into the original document if preferred."""
1872
  )
1873
 
1874
  # Examples for Word/Excel/csv redaction and tabular duplicate detection
 
1876
  gr.Markdown(
1877
  "### Try an example - Click on an example below and then the 'Redact text/data files' button for redaction, or the 'Find duplicate cells/rows' button for duplicate detection:"
1878
  )
1879
+
1880
  # Check which tabular example files exist
1881
  tabular_example_files = [
1882
  "example_data/combined_case_notes.csv",
1883
  "example_data/Bold minimalist professional cover letter.docx",
1884
+ "example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv",
1885
  ]
1886
+
1887
+ available_tabular_examples = list()
1888
+ tabular_example_labels = list()
1889
+
1890
  # Check each tabular example file and add to examples if it exists
1891
  if os.path.exists(tabular_example_files[0]):
1892
+ available_tabular_examples.append(
1893
+ [
1894
+ [tabular_example_files[0]],
1895
+ ["Case Note", "Client"],
1896
+ "Local",
1897
+ "replace with 'REDACTED'",
1898
+ [tabular_example_files[0]],
1899
+ ["Case Note"],
1900
+ ]
1901
+ )
1902
+ tabular_example_labels.append(
1903
+ "CSV file redaction with specific columns - remove text"
1904
+ )
1905
+
1906
  if os.path.exists(tabular_example_files[1]):
1907
+ available_tabular_examples.append(
1908
+ [
1909
+ [tabular_example_files[1]],
1910
+ [],
1911
+ "Local",
1912
+ "replace with 'REDACTED'",
1913
+ [],
1914
+ [],
1915
+ ]
1916
+ )
1917
+ tabular_example_labels.append(
1918
+ "Word document redaction - replace with REDACTED"
1919
+ )
1920
+
1921
  if os.path.exists(tabular_example_files[2]):
1922
+ available_tabular_examples.append(
1923
+ [
1924
+ [tabular_example_files[2]],
1925
+ ["text"],
1926
+ "Local",
1927
+ "replace with 'REDACTED'",
1928
+ [tabular_example_files[2]],
1929
+ ["text"],
1930
+ ]
1931
+ )
1932
+ tabular_example_labels.append(
1933
+ "Tabular duplicate detection in CSV files"
1934
+ )
1935
+
1936
  # Only create examples if we have available files
1937
  if available_tabular_examples:
1938
+
1939
+ def show_tabular_info_box_on_click(
1940
+ in_data_files,
1941
+ in_colnames,
1942
+ pii_identification_method_drop_tabular,
1943
+ anon_strategy,
1944
+ in_tabular_duplicate_files,
1945
+ tabular_text_columns,
1946
+ ):
1947
+ gr.Info(
1948
+ "Example data loaded. Now click on 'Redact text/data files' or 'Find duplicate cells/rows' below to run the example."
1949
+ )
1950
+
1951
  tabular_examples = gr.Examples(
1952
  examples=available_tabular_examples,
1953
  inputs=[
 
1959
  tabular_text_columns,
1960
  ],
1961
  example_labels=tabular_example_labels,
1962
+ fn=show_tabular_info_box_on_click,
1963
+ run_on_click=True,
1964
  )
1965
 
1966
  with gr.Accordion("Redact Word or Excel/csv files", open=True):
 
2387
  # Recalculate estimated costs based on changes to inputs
2388
  if SHOW_COSTS == "True":
2389
  # Calculate costs
2390
+ total_pdf_page_count.input(
2391
  calculate_aws_costs,
2392
  inputs=[
2393
  total_pdf_page_count,
 
2399
  ],
2400
  outputs=[estimated_aws_costs_number],
2401
  )
2402
+ text_extract_method_radio.input(
2403
  fn=check_for_relevant_ocr_output_with_words,
2404
  inputs=[
2405
  doc_file_name_no_extension_textbox,
 
2419
  ],
2420
  outputs=[estimated_aws_costs_number],
2421
  )
2422
+ pii_identification_method_drop.input(
2423
  calculate_aws_costs,
2424
  inputs=[
2425
  total_pdf_page_count,
 
2431
  ],
2432
  outputs=[estimated_aws_costs_number],
2433
  )
2434
+ handwrite_signature_checkbox.input(
2435
  calculate_aws_costs,
2436
  inputs=[
2437
  total_pdf_page_count,
 
2443
  ],
2444
  outputs=[estimated_aws_costs_number],
2445
  )
2446
+ textract_output_found_checkbox.input(
2447
  calculate_aws_costs,
2448
  inputs=[
2449
  total_pdf_page_count,
 
2455
  ],
2456
  outputs=[estimated_aws_costs_number],
2457
  )
2458
+ only_extract_text_radio.input(
2459
  calculate_aws_costs,
2460
  inputs=[
2461
  total_pdf_page_count,
 
2467
  ],
2468
  outputs=[estimated_aws_costs_number],
2469
  )
2470
+ textract_output_found_checkbox.input(
2471
  calculate_aws_costs,
2472
  inputs=[
2473
  total_pdf_page_count,
 
2481
  )
2482
 
2483
  # Calculate time taken
2484
+ total_pdf_page_count.input(
2485
  calculate_time_taken,
2486
  inputs=[
2487
  total_pdf_page_count,
 
2493
  ],
2494
  outputs=[estimated_time_taken_number],
2495
  )
2496
+ text_extract_method_radio.input(
2497
  calculate_time_taken,
2498
  inputs=[
2499
  total_pdf_page_count,
 
2505
  ],
2506
  outputs=[estimated_time_taken_number],
2507
  )
2508
+ pii_identification_method_drop.input(
2509
  calculate_time_taken,
2510
  inputs=[
2511
  total_pdf_page_count,
 
2517
  ],
2518
  outputs=[estimated_time_taken_number],
2519
  )
2520
+ handwrite_signature_checkbox.input(
2521
  calculate_time_taken,
2522
  inputs=[
2523
  total_pdf_page_count,
 
2529
  ],
2530
  outputs=[estimated_time_taken_number],
2531
  )
2532
+ textract_output_found_checkbox.input(
2533
  calculate_time_taken,
2534
  inputs=[
2535
  total_pdf_page_count,
 
2542
  ],
2543
  outputs=[estimated_time_taken_number],
2544
  )
2545
+ only_extract_text_radio.input(
2546
  calculate_time_taken,
2547
  inputs=[
2548
  total_pdf_page_count,
 
2554
  ],
2555
  outputs=[estimated_time_taken_number],
2556
  )
2557
+ textract_output_found_checkbox.input(
2558
  calculate_time_taken,
2559
  inputs=[
2560
  total_pdf_page_count,
 
2566
  ],
2567
  outputs=[estimated_time_taken_number],
2568
  )
2569
+ relevant_ocr_output_with_words_found_checkbox.input(
2570
  calculate_time_taken,
2571
  inputs=[
2572
  total_pdf_page_count,
 
5264
  pdf_doc_state,
5265
  images_pdf_state,
5266
  output_folder_textbox,
5267
+ input_folder_textbox,
5268
  ],
5269
  outputs=[input_pdf_for_review],
5270
  scroll_to_output=True,
 
6498
  # Run the CLI main function with direct mode arguments
6499
  main(direct_mode_args=direct_mode_args)
6500
 
 
6501
  # Combine extraction options
6502
  extraction_options = (
6503
  list(direct_mode_args["handwrite_signature_extraction"])
pyproject.toml CHANGED
@@ -36,7 +36,8 @@ dependencies = [
36
  "python-dotenv==1.0.1",
37
  "awslambdaric==3.1.1",
38
  "python-docx==1.2.0",
39
- "polars==1.33.1"
 
40
  #"paddlepaddle==3.2.0", # Optional paddle imports - only if you want to use hybrid OCR mode with tesseract and paddleOCR
41
  #"paddleocr==3.2.0"
42
  ]
@@ -66,4 +67,27 @@ ignore = [
66
  # Configuration for a Black formatter:
67
  [tool.black]
68
  line-length = 88
69
- target-version = ['py310']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  "python-dotenv==1.0.1",
37
  "awslambdaric==3.1.1",
38
  "python-docx==1.2.0",
39
+ "polars==1.33.1",
40
+ "defusedxml==0.7.1",
41
  #"paddlepaddle==3.2.0", # Optional paddle imports - only if you want to use hybrid OCR mode with tesseract and paddleOCR
42
  #"paddleocr==3.2.0"
43
  ]
 
67
  # Configuration for a Black formatter:
68
  [tool.black]
69
  line-length = 88
70
+ target-version = ['py310']
71
+
72
+ # Configuration for pytest:
73
+ [tool.pytest.ini_options]
74
+ filterwarnings = [
75
+ "ignore::DeprecationWarning:click.parser",
76
+ "ignore::DeprecationWarning:weasel.util.config",
77
+ "ignore::DeprecationWarning:builtin type",
78
+ "ignore::DeprecationWarning:websockets.legacy",
79
+ "ignore::DeprecationWarning:websockets.server",
80
+ "ignore::DeprecationWarning:spacy.cli._util",
81
+ "ignore::DeprecationWarning:weasel.util.config",
82
+ "ignore::DeprecationWarning:importlib._bootstrap",
83
+ ]
84
+ testpaths = ["test"]
85
+ python_files = ["test_*.py", "*_test.py"]
86
+ python_classes = ["Test*"]
87
+ python_functions = ["test_*"]
88
+ addopts = [
89
+ "-v",
90
+ "--tb=short",
91
+ "--strict-markers",
92
+ "--disable-warnings",
93
+ ]
requirements.txt CHANGED
@@ -23,6 +23,7 @@ rapidfuzz==3.14.1
23
  python-dotenv==1.0.1
24
  awslambdaric==3.1.1
25
  python-docx==1.2.0
 
26
  # Optional: uncomment the below to install paddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
27
  # paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
28
  # paddleocr==3.2.0
 
23
  python-dotenv==1.0.1
24
  awslambdaric==3.1.1
25
  python-docx==1.2.0
26
+ defusedxml==0.7.1
27
  # Optional: uncomment the below to install paddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
28
  # paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
29
  # paddleocr==3.2.0
tools/config.py CHANGED
@@ -544,6 +544,7 @@ except Exception as e:
544
  COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
545
 
546
  SHOW_EXAMPLES = get_or_create_env_var("SHOW_EXAMPLES", "False")
 
547
 
548
  RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")
549
 
 
544
  COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
545
 
546
  SHOW_EXAMPLES = get_or_create_env_var("SHOW_EXAMPLES", "False")
547
+ SHOW_AWS_EXAMPLES = get_or_create_env_var("SHOW_AWS_EXAMPLES", "False")
548
 
549
  RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")
550
 
tools/data_anonymise.py CHANGED
@@ -588,6 +588,9 @@ def anonymise_files_with_open_text(
588
  # Set to a very high number so as not to mess with subsequent file processing by the user
589
  # latest_file_completed = 99
590
  final_out_message = "\n".join(out_message)
 
 
 
591
  return (
592
  final_out_message,
593
  out_file_paths,
 
588
  # Set to a very high number so as not to mess with subsequent file processing by the user
589
  # latest_file_completed = 99
590
  final_out_message = "\n".join(out_message)
591
+
592
+ gr.Info(final_out_message)
593
+
594
  return (
595
  final_out_message,
596
  out_file_paths,
tools/file_redaction.py CHANGED
@@ -470,6 +470,8 @@ def choose_and_run_redactor(
470
  )
471
  print("Estimated total processing time:", str(estimate_total_processing_time))
472
 
 
 
473
  page_break_return = True
474
 
475
  return (
 
470
  )
471
  print("Estimated total processing time:", str(estimate_total_processing_time))
472
 
473
+ gr.Info(combined_out_message)
474
+
475
  page_break_return = True
476
 
477
  return (
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -277,7 +277,7 @@ def download_tesseract_lang_pack(
277
 
278
  # Download the file
279
  try:
280
- response = requests.get(url, stream=True)
281
  response.raise_for_status() # Raise an exception for bad status codes
282
 
283
  with open(file_path, "wb") as f:
 
277
 
278
  # Download the file
279
  try:
280
+ response = requests.get(url, stream=True, timeout=60)
281
  response.raise_for_status() # Raise an exception for bad status codes
282
 
283
  with open(file_path, "wb") as f:
tools/redaction_review.py CHANGED
@@ -4,8 +4,14 @@ import string
4
  import uuid
5
  from datetime import datetime, timedelta, timezone
6
  from typing import Dict, List, Set, Tuple
7
- from xml.dom import minidom
8
- from xml.etree.ElementTree import Element, SubElement, parse, tostring
 
 
 
 
 
 
9
 
10
  import gradio as gr
11
  import numpy as np
@@ -617,6 +623,10 @@ def update_annotator_page_from_review_df(
617
  print(
618
  f"Error during image path replacement for page {gradio_annotator_current_page_number}: {e}"
619
  )
 
 
 
 
620
 
621
  # Save back page_sizes_df to page_sizes list format
622
  if not page_sizes_df.empty:
@@ -2652,7 +2662,7 @@ def create_xfdf(
2652
  data_element.text = data_content_string
2653
 
2654
  rough_string = tostring(xfdf_root, encoding="unicode", method="xml")
2655
- reparsed = minidom.parseString(rough_string)
2656
  return reparsed.toxml() # .toprettyxml(indent=" ")
2657
 
2658
 
@@ -2793,7 +2803,9 @@ def parse_xfdf(xfdf_path: str):
2793
  Returns:
2794
  - List of dictionaries containing redaction information
2795
  """
2796
- tree = parse(xfdf_path)
 
 
2797
  root = tree.getroot()
2798
 
2799
  # Define the namespace
@@ -2804,6 +2816,25 @@ def parse_xfdf(xfdf_path: str):
2804
  # Find all redact elements using the namespace
2805
  for redact in root.findall(".//xfdf:redact", namespaces=namespace):
2806
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2807
  redaction_info = {
2808
  "image": "", # Image will be filled in later
2809
  "page": int(redact.get("page")) + 1, # Convert to 1-based index
@@ -2812,7 +2843,7 @@ def parse_xfdf(xfdf_path: str):
2812
  "xmax": float(redact.get("rect").split(",")[2]),
2813
  "ymax": float(redact.get("rect").split(",")[3]),
2814
  "label": redact.get("title"),
2815
- "text": redact.get("contents"),
2816
  "color": redact.get(
2817
  "border-color", "(0, 0, 0)"
2818
  ), # Default to black if not specified
@@ -2824,9 +2855,10 @@ def parse_xfdf(xfdf_path: str):
2824
 
2825
  def convert_xfdf_to_dataframe(
2826
  file_paths_list: List[str],
2827
- pymupdf_doc,
2828
  image_paths: List[str],
2829
  output_folder: str = OUTPUT_FOLDER,
 
2830
  ):
2831
  """
2832
  Convert redaction annotations from XFDF and associated images into a DataFrame.
@@ -2835,12 +2867,16 @@ def convert_xfdf_to_dataframe(
2835
  - xfdf_path: Path to the XFDF file
2836
  - pdf_doc: PyMuPDF document object
2837
  - image_paths: List of PIL Image objects corresponding to PDF pages
 
 
2838
 
2839
  Returns:
2840
  - DataFrame containing redaction information
2841
  """
2842
  output_paths = list()
2843
  df = pd.DataFrame()
 
 
2844
 
2845
  # Sort the file paths so that the pdfs come first
2846
  file_paths_list = sorted(
@@ -2863,6 +2899,7 @@ def convert_xfdf_to_dataframe(
2863
 
2864
  if file_path_end == "pdf":
2865
  pdf_name = os.path.basename(file_path)
 
2866
 
2867
  # Add pdf to outputs
2868
  output_paths.append(file_path)
@@ -2896,7 +2933,18 @@ def convert_xfdf_to_dataframe(
2896
  image_path = image_paths[page_python_format]
2897
 
2898
  if isinstance(image_path, str):
2899
- image = Image.open(image_path)
 
 
 
 
 
 
 
 
 
 
 
2900
 
2901
  image_page_width, image_page_height = image.size
2902
 
@@ -2927,4 +2975,8 @@ def convert_xfdf_to_dataframe(
2927
 
2928
  output_paths.append(out_file_path)
2929
 
 
 
 
 
2930
  return output_paths
 
4
  import uuid
5
  from datetime import datetime, timedelta, timezone
6
  from typing import Dict, List, Set, Tuple
7
+ from xml.etree.ElementTree import Element, SubElement, tostring
8
+
9
+ import defusedxml
10
+ import defusedxml.ElementTree as defused_etree
11
+ import defusedxml.minidom as defused_minidom
12
+
13
+ # Defuse the standard library XML modules for security
14
+ defusedxml.defuse_stdlib()
15
 
16
  import gradio as gr
17
  import numpy as np
 
623
  print(
624
  f"Error during image path replacement for page {gradio_annotator_current_page_number}: {e}"
625
  )
626
+ else:
627
+ print(
628
+ f"Warning: Page index {page_num_reported_zero_indexed} out of bounds for all_image_annotations list."
629
+ )
630
 
631
  # Save back page_sizes_df to page_sizes list format
632
  if not page_sizes_df.empty:
 
2662
  data_element.text = data_content_string
2663
 
2664
  rough_string = tostring(xfdf_root, encoding="unicode", method="xml")
2665
+ reparsed = defused_minidom.parseString(rough_string)
2666
  return reparsed.toxml() # .toprettyxml(indent=" ")
2667
 
2668
 
 
2803
  Returns:
2804
  - List of dictionaries containing redaction information
2805
  """
2806
+ # Assuming xfdf_path is a file path. If you are passing the XML string,
2807
+ # you would use defused_etree.fromstring(xfdf_string) instead of .parse()
2808
+ tree = defused_etree.parse(xfdf_path)
2809
  root = tree.getroot()
2810
 
2811
  # Define the namespace
 
2816
  # Find all redact elements using the namespace
2817
  for redact in root.findall(".//xfdf:redact", namespaces=namespace):
2818
 
2819
+ # Extract text from contents-richtext if it exists
2820
+ text_content = ""
2821
+
2822
+ # *** THE FIX IS HERE ***
2823
+ # Use the namespace to find the contents-richtext element
2824
+ contents_richtext = redact.find(
2825
+ ".//xfdf:contents-richtext", namespaces=namespace
2826
+ )
2827
+
2828
+ if contents_richtext is not None:
2829
+ # Get all text content from the HTML structure
2830
+ # The children of contents-richtext (body, p, span) have a different namespace
2831
+ # but itertext() cleverly handles that for us.
2832
+ text_content = "".join(contents_richtext.itertext()).strip()
2833
+
2834
+ # Fallback to contents attribute if no richtext content
2835
+ if not text_content:
2836
+ text_content = redact.get("contents", "")
2837
+
2838
  redaction_info = {
2839
  "image": "", # Image will be filled in later
2840
  "page": int(redact.get("page")) + 1, # Convert to 1-based index
 
2843
  "xmax": float(redact.get("rect").split(",")[2]),
2844
  "ymax": float(redact.get("rect").split(",")[3]),
2845
  "label": redact.get("title"),
2846
+ "text": text_content, # Use the extracted text content
2847
  "color": redact.get(
2848
  "border-color", "(0, 0, 0)"
2849
  ), # Default to black if not specified
 
2855
 
2856
  def convert_xfdf_to_dataframe(
2857
  file_paths_list: List[str],
2858
+ pymupdf_doc: Document,
2859
  image_paths: List[str],
2860
  output_folder: str = OUTPUT_FOLDER,
2861
+ input_folder: str = INPUT_FOLDER,
2862
  ):
2863
  """
2864
  Convert redaction annotations from XFDF and associated images into a DataFrame.
 
2867
  - xfdf_path: Path to the XFDF file
2868
  - pdf_doc: PyMuPDF document object
2869
  - image_paths: List of PIL Image objects corresponding to PDF pages
2870
+ - output_folder: Output folder for file save
2871
+ - input_folder: Input folder for image creation
2872
 
2873
  Returns:
2874
  - DataFrame containing redaction information
2875
  """
2876
  output_paths = list()
2877
  df = pd.DataFrame()
2878
+ pdf_name = ""
2879
+ pdf_path = ""
2880
 
2881
  # Sort the file paths so that the pdfs come first
2882
  file_paths_list = sorted(
 
2899
 
2900
  if file_path_end == "pdf":
2901
  pdf_name = os.path.basename(file_path)
2902
+ pdf_path = file_path
2903
 
2904
  # Add pdf to outputs
2905
  output_paths.append(file_path)
 
2933
  image_path = image_paths[page_python_format]
2934
 
2935
  if isinstance(image_path, str):
2936
+ try:
2937
+ image = Image.open(image_path)
2938
+ except Exception:
2939
+ # print(f"Error opening image: {e}")
2940
+
2941
+ page_num, out_path, width, height = (
2942
+ process_single_page_for_image_conversion(
2943
+ pdf_path, page_python_format, input_folder=input_folder
2944
+ )
2945
+ )
2946
+
2947
+ image = Image.open(out_path)
2948
 
2949
  image_page_width, image_page_height = image.size
2950
 
 
2975
 
2976
  output_paths.append(out_file_path)
2977
 
2978
+ gr.Info(
2979
+ f"Review file saved to {out_file_path}. Now click on '1. Upload original pdf' to view the pdf with the annotations."
2980
+ )
2981
+
2982
  return output_paths
tools/secure_regex_utils.py CHANGED
@@ -86,7 +86,7 @@ def safe_extract_page_number_from_filename(filename: str) -> Optional[int]:
86
 
87
  def safe_extract_page_number_from_path(path: str) -> Optional[int]:
88
  """
89
- Safely extract page number from path containing _(\d+).png pattern.
90
 
91
  Args:
92
  path: The path to extract page number from
 
86
 
87
  def safe_extract_page_number_from_path(path: str) -> Optional[int]:
88
  """
89
+ Safely extract page number from path containing _(\\d+).png pattern.
90
 
91
  Args:
92
  path: The path to extract page number from