Commit
·
1cb1897
1
Parent(s):
96ac47b
Removed some extraneous test steps. Improved example loading and feedback, as well as redaction feedback. Made minor security updates. Fixed Adobe XFDF file parsing.
Browse files
- .coveragerc +56 -0
- .github/workflows/ci.yml +13 -9
- app.py +180 -106
- pyproject.toml +26 -2
- requirements.txt +1 -0
- tools/config.py +1 -0
- tools/data_anonymise.py +3 -0
- tools/file_redaction.py +2 -0
- tools/load_spacy_model_custom_recognisers.py +1 -1
- tools/redaction_review.py +59 -7
- tools/secure_regex_utils.py +1 -1
.coveragerc
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[run]
|
2 |
+
source = .
|
3 |
+
omit =
|
4 |
+
*/tests/*
|
5 |
+
*/test/*
|
6 |
+
*/__pycache__/*
|
7 |
+
*/venv/*
|
8 |
+
*/env/*
|
9 |
+
*/build/*
|
10 |
+
*/dist/*
|
11 |
+
*/cdk/*
|
12 |
+
*/docs/*
|
13 |
+
*/example_data/*
|
14 |
+
*/examples/*
|
15 |
+
*/feedback/*
|
16 |
+
*/logs/*
|
17 |
+
*/old_code/*
|
18 |
+
*/output/*
|
19 |
+
*/tmp/*
|
20 |
+
*/usage/*
|
21 |
+
*/tld/*
|
22 |
+
*/tesseract/*
|
23 |
+
*/poppler/*
|
24 |
+
config*.py
|
25 |
+
setup.py
|
26 |
+
lambda_entrypoint.py
|
27 |
+
entrypoint.sh
|
28 |
+
cli_redact.py
|
29 |
+
load_dynamo_logs.py
|
30 |
+
load_s3_logs.py
|
31 |
+
*.spec
|
32 |
+
Dockerfile
|
33 |
+
*.qmd
|
34 |
+
*.md
|
35 |
+
*.txt
|
36 |
+
*.yml
|
37 |
+
*.yaml
|
38 |
+
*.json
|
39 |
+
*.csv
|
40 |
+
*.env
|
41 |
+
*.bat
|
42 |
+
*.ps1
|
43 |
+
*.sh
|
44 |
+
|
45 |
+
[report]
|
46 |
+
exclude_lines =
|
47 |
+
pragma: no cover
|
48 |
+
def __repr__
|
49 |
+
if self.debug:
|
50 |
+
if settings.DEBUG
|
51 |
+
raise AssertionError
|
52 |
+
raise NotImplementedError
|
53 |
+
if 0:
|
54 |
+
if __name__ == .__main__.:
|
55 |
+
class .*\bProtocol\):
|
56 |
+
@(abc\.)?abstractmethod
|
.github/workflows/ci.yml
CHANGED
@@ -90,6 +90,10 @@ jobs:
|
|
90 |
run: |
|
91 |
python .github/scripts/setup_test_data.py
|
92 |
|
|
|
|
|
|
|
|
|
93 |
- name: Run CLI tests
|
94 |
run: |
|
95 |
cd test
|
@@ -101,16 +105,16 @@ jobs:
|
|
101 |
|
102 |
- name: Run tests with coverage
|
103 |
run: |
|
104 |
-
pytest test/test.py --cov=. --cov-report=xml --cov-report=html --cov-report=term
|
105 |
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
|
115 |
- name: Upload test results
|
116 |
uses: actions/upload-artifact@v4
|
|
|
90 |
run: |
|
91 |
python .github/scripts/setup_test_data.py
|
92 |
|
93 |
+
- name: Clean up problematic config files
|
94 |
+
run: |
|
95 |
+
rm -f config*.py || true
|
96 |
+
|
97 |
- name: Run CLI tests
|
98 |
run: |
|
99 |
cd test
|
|
|
105 |
|
106 |
- name: Run tests with coverage
|
107 |
run: |
|
108 |
+
pytest test/test.py --cov=. --cov-config=.coveragerc --cov-report=xml --cov-report=html --cov-report=term
|
109 |
|
110 |
+
#- name: Upload coverage to Codecov - not necessary
|
111 |
+
# uses: codecov/codecov-action@v3
|
112 |
+
# if: matrix.python-version == '3.11'
|
113 |
+
# with:
|
114 |
+
# file: ./coverage.xml
|
115 |
+
# flags: unittests
|
116 |
+
# name: codecov-umbrella
|
117 |
+
# fail_ci_if_error: false
|
118 |
|
119 |
- name: Upload test results
|
120 |
uses: actions/upload-artifact@v4
|
app.py
CHANGED
@@ -93,6 +93,7 @@ from tools.config import (
|
|
93 |
SAVE_LOGS_TO_CSV,
|
94 |
SAVE_LOGS_TO_DYNAMODB,
|
95 |
SESSION_OUTPUT_FOLDER,
|
|
|
96 |
SHOW_COSTS,
|
97 |
SHOW_EXAMPLES,
|
98 |
SHOW_LANGUAGE_SELECTION,
|
@@ -1008,75 +1009,101 @@ with app:
|
|
1008 |
###
|
1009 |
with gr.Tab("Redact PDFs/images"):
|
1010 |
|
1011 |
-
# Examples for PDF/image redaction
|
1012 |
# Examples for PDF/image redaction
|
1013 |
if SHOW_EXAMPLES == "True":
|
1014 |
gr.Markdown(
|
1015 |
"### Try an example - Click on an example below and then the 'Extract text and redact document' button:"
|
1016 |
)
|
1017 |
-
|
1018 |
# Check which example files exist and create examples only for available files
|
1019 |
example_files = [
|
1020 |
"example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
|
1021 |
-
"example_data/example_complaint_letter.jpg",
|
1022 |
"example_data/graduate-job-example-cover-letter.pdf",
|
1023 |
-
"example_data/Partnership-Agreement-Toolkit_0_0.pdf"
|
1024 |
]
|
1025 |
-
|
1026 |
-
available_examples =
|
1027 |
-
example_labels =
|
1028 |
-
|
1029 |
# Check each example file and add to examples if it exists
|
1030 |
if os.path.exists(example_files[0]):
|
1031 |
-
available_examples.append(
|
1032 |
-
[
|
1033 |
-
|
1034 |
-
|
1035 |
-
|
1036 |
-
|
1037 |
-
|
1038 |
-
|
1039 |
-
|
|
|
|
|
1040 |
example_labels.append("PDF with selectable text redaction")
|
1041 |
-
|
1042 |
if os.path.exists(example_files[1]):
|
1043 |
-
available_examples.append(
|
1044 |
-
[
|
1045 |
-
|
1046 |
-
|
1047 |
-
|
1048 |
-
|
1049 |
-
|
1050 |
-
|
1051 |
-
|
|
|
|
|
1052 |
example_labels.append("Image redaction with local OCR")
|
1053 |
-
|
1054 |
if os.path.exists(example_files[2]):
|
1055 |
-
available_examples.append(
|
1056 |
-
[
|
1057 |
-
|
1058 |
-
|
1059 |
-
|
1060 |
-
|
1061 |
-
|
1062 |
-
|
1063 |
-
|
1064 |
-
|
1065 |
-
|
|
|
|
|
|
|
|
|
1066 |
if os.path.exists(example_files[3]):
|
1067 |
-
|
1068 |
-
|
1069 |
-
|
1070 |
-
|
1071 |
-
|
1072 |
-
|
1073 |
-
|
1074 |
-
|
1075 |
-
|
1076 |
-
|
1077 |
-
|
|
|
|
|
|
|
|
|
|
|
1078 |
# Only create examples if we have available files
|
1079 |
if available_examples:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1080 |
redaction_examples = gr.Examples(
|
1081 |
examples=available_examples,
|
1082 |
inputs=[
|
@@ -1089,6 +1116,8 @@ with app:
|
|
1089 |
prepared_pdf_state,
|
1090 |
],
|
1091 |
example_labels=example_labels,
|
|
|
|
|
1092 |
)
|
1093 |
|
1094 |
with gr.Accordion("Redact document", open=True):
|
@@ -1664,16 +1693,32 @@ with app:
|
|
1664 |
"Search for duplicate pages/subdocuments in your ocr_output files. By default, this function will search for duplicate text across multiple pages, and then join consecutive matching pages together into matched 'subdocuments'. The results can be reviewed below, false positives removed, and then the verified results applied to a document you have loaded in on the 'Review redactions' tab."
|
1665 |
)
|
1666 |
|
|
|
|
|
|
|
1667 |
# Examples for duplicate page detection
|
1668 |
if SHOW_EXAMPLES == "True":
|
1669 |
gr.Markdown(
|
1670 |
"### Try an example - Click on an example below and then the 'Identify duplicate pages/subdocuments' button:"
|
1671 |
)
|
1672 |
-
|
1673 |
# Check if duplicate example file exists
|
1674 |
-
duplicate_example_file =
|
1675 |
-
|
|
|
|
|
1676 |
if os.path.exists(duplicate_example_file):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1677 |
duplicate_examples = gr.Examples(
|
1678 |
examples=[
|
1679 |
[
|
@@ -1699,6 +1744,8 @@ with app:
|
|
1699 |
"Find duplicate pages of text in document OCR outputs",
|
1700 |
"Find duplicate text lines in document OCR outputs",
|
1701 |
],
|
|
|
|
|
1702 |
)
|
1703 |
|
1704 |
with gr.Accordion("Step 1: Configure and run analysis", open=True):
|
@@ -1821,7 +1868,7 @@ with app:
|
|
1821 |
###
|
1822 |
with gr.Tab(label="Word or Excel/csv files"):
|
1823 |
gr.Markdown(
|
1824 |
-
"""Choose Word or
|
1825 |
)
|
1826 |
|
1827 |
# Examples for Word/Excel/csv redaction and tabular duplicate detection
|
@@ -1829,53 +1876,78 @@ with app:
|
|
1829 |
gr.Markdown(
|
1830 |
"### Try an example - Click on an example below and then the 'Redact text/data files' button for redaction, or the 'Find duplicate cells/rows' button for duplicate detection:"
|
1831 |
)
|
1832 |
-
|
1833 |
# Check which tabular example files exist
|
1834 |
tabular_example_files = [
|
1835 |
"example_data/combined_case_notes.csv",
|
1836 |
"example_data/Bold minimalist professional cover letter.docx",
|
1837 |
-
"example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
|
1838 |
]
|
1839 |
-
|
1840 |
-
available_tabular_examples =
|
1841 |
-
tabular_example_labels =
|
1842 |
-
|
1843 |
# Check each tabular example file and add to examples if it exists
|
1844 |
if os.path.exists(tabular_example_files[0]):
|
1845 |
-
available_tabular_examples.append(
|
1846 |
-
[
|
1847 |
-
|
1848 |
-
|
1849 |
-
|
1850 |
-
|
1851 |
-
|
1852 |
-
|
1853 |
-
|
1854 |
-
|
|
|
|
|
|
|
|
|
1855 |
if os.path.exists(tabular_example_files[1]):
|
1856 |
-
available_tabular_examples.append(
|
1857 |
-
[
|
1858 |
-
|
1859 |
-
|
1860 |
-
|
1861 |
-
|
1862 |
-
|
1863 |
-
|
1864 |
-
|
1865 |
-
|
|
|
|
|
|
|
|
|
1866 |
if os.path.exists(tabular_example_files[2]):
|
1867 |
-
available_tabular_examples.append(
|
1868 |
-
[
|
1869 |
-
|
1870 |
-
|
1871 |
-
|
1872 |
-
|
1873 |
-
|
1874 |
-
|
1875 |
-
|
1876 |
-
|
|
|
|
|
|
|
|
|
1877 |
# Only create examples if we have available files
|
1878 |
if available_tabular_examples:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1879 |
tabular_examples = gr.Examples(
|
1880 |
examples=available_tabular_examples,
|
1881 |
inputs=[
|
@@ -1887,6 +1959,8 @@ with app:
|
|
1887 |
tabular_text_columns,
|
1888 |
],
|
1889 |
example_labels=tabular_example_labels,
|
|
|
|
|
1890 |
)
|
1891 |
|
1892 |
with gr.Accordion("Redact Word or Excel/csv files", open=True):
|
@@ -2313,7 +2387,7 @@ with app:
|
|
2313 |
# Recalculate estimated costs based on changes to inputs
|
2314 |
if SHOW_COSTS == "True":
|
2315 |
# Calculate costs
|
2316 |
-
total_pdf_page_count.
|
2317 |
calculate_aws_costs,
|
2318 |
inputs=[
|
2319 |
total_pdf_page_count,
|
@@ -2325,7 +2399,7 @@ with app:
|
|
2325 |
],
|
2326 |
outputs=[estimated_aws_costs_number],
|
2327 |
)
|
2328 |
-
text_extract_method_radio.
|
2329 |
fn=check_for_relevant_ocr_output_with_words,
|
2330 |
inputs=[
|
2331 |
doc_file_name_no_extension_textbox,
|
@@ -2345,7 +2419,7 @@ with app:
|
|
2345 |
],
|
2346 |
outputs=[estimated_aws_costs_number],
|
2347 |
)
|
2348 |
-
pii_identification_method_drop.
|
2349 |
calculate_aws_costs,
|
2350 |
inputs=[
|
2351 |
total_pdf_page_count,
|
@@ -2357,7 +2431,7 @@ with app:
|
|
2357 |
],
|
2358 |
outputs=[estimated_aws_costs_number],
|
2359 |
)
|
2360 |
-
handwrite_signature_checkbox.
|
2361 |
calculate_aws_costs,
|
2362 |
inputs=[
|
2363 |
total_pdf_page_count,
|
@@ -2369,7 +2443,7 @@ with app:
|
|
2369 |
],
|
2370 |
outputs=[estimated_aws_costs_number],
|
2371 |
)
|
2372 |
-
textract_output_found_checkbox.
|
2373 |
calculate_aws_costs,
|
2374 |
inputs=[
|
2375 |
total_pdf_page_count,
|
@@ -2381,7 +2455,7 @@ with app:
|
|
2381 |
],
|
2382 |
outputs=[estimated_aws_costs_number],
|
2383 |
)
|
2384 |
-
only_extract_text_radio.
|
2385 |
calculate_aws_costs,
|
2386 |
inputs=[
|
2387 |
total_pdf_page_count,
|
@@ -2393,7 +2467,7 @@ with app:
|
|
2393 |
],
|
2394 |
outputs=[estimated_aws_costs_number],
|
2395 |
)
|
2396 |
-
textract_output_found_checkbox.
|
2397 |
calculate_aws_costs,
|
2398 |
inputs=[
|
2399 |
total_pdf_page_count,
|
@@ -2407,7 +2481,7 @@ with app:
|
|
2407 |
)
|
2408 |
|
2409 |
# Calculate time taken
|
2410 |
-
total_pdf_page_count.
|
2411 |
calculate_time_taken,
|
2412 |
inputs=[
|
2413 |
total_pdf_page_count,
|
@@ -2419,7 +2493,7 @@ with app:
|
|
2419 |
],
|
2420 |
outputs=[estimated_time_taken_number],
|
2421 |
)
|
2422 |
-
text_extract_method_radio.
|
2423 |
calculate_time_taken,
|
2424 |
inputs=[
|
2425 |
total_pdf_page_count,
|
@@ -2431,7 +2505,7 @@ with app:
|
|
2431 |
],
|
2432 |
outputs=[estimated_time_taken_number],
|
2433 |
)
|
2434 |
-
pii_identification_method_drop.
|
2435 |
calculate_time_taken,
|
2436 |
inputs=[
|
2437 |
total_pdf_page_count,
|
@@ -2443,7 +2517,7 @@ with app:
|
|
2443 |
],
|
2444 |
outputs=[estimated_time_taken_number],
|
2445 |
)
|
2446 |
-
handwrite_signature_checkbox.
|
2447 |
calculate_time_taken,
|
2448 |
inputs=[
|
2449 |
total_pdf_page_count,
|
@@ -2455,7 +2529,7 @@ with app:
|
|
2455 |
],
|
2456 |
outputs=[estimated_time_taken_number],
|
2457 |
)
|
2458 |
-
textract_output_found_checkbox.
|
2459 |
calculate_time_taken,
|
2460 |
inputs=[
|
2461 |
total_pdf_page_count,
|
@@ -2468,7 +2542,7 @@ with app:
|
|
2468 |
],
|
2469 |
outputs=[estimated_time_taken_number],
|
2470 |
)
|
2471 |
-
only_extract_text_radio.
|
2472 |
calculate_time_taken,
|
2473 |
inputs=[
|
2474 |
total_pdf_page_count,
|
@@ -2480,7 +2554,7 @@ with app:
|
|
2480 |
],
|
2481 |
outputs=[estimated_time_taken_number],
|
2482 |
)
|
2483 |
-
textract_output_found_checkbox.
|
2484 |
calculate_time_taken,
|
2485 |
inputs=[
|
2486 |
total_pdf_page_count,
|
@@ -2492,7 +2566,7 @@ with app:
|
|
2492 |
],
|
2493 |
outputs=[estimated_time_taken_number],
|
2494 |
)
|
2495 |
-
relevant_ocr_output_with_words_found_checkbox.
|
2496 |
calculate_time_taken,
|
2497 |
inputs=[
|
2498 |
total_pdf_page_count,
|
@@ -5190,6 +5264,7 @@ with app:
|
|
5190 |
pdf_doc_state,
|
5191 |
images_pdf_state,
|
5192 |
output_folder_textbox,
|
|
|
5193 |
],
|
5194 |
outputs=[input_pdf_for_review],
|
5195 |
scroll_to_output=True,
|
@@ -6423,7 +6498,6 @@ if __name__ == "__main__":
|
|
6423 |
# Run the CLI main function with direct mode arguments
|
6424 |
main(direct_mode_args=direct_mode_args)
|
6425 |
|
6426 |
-
|
6427 |
# Combine extraction options
|
6428 |
extraction_options = (
|
6429 |
list(direct_mode_args["handwrite_signature_extraction"])
|
|
|
93 |
SAVE_LOGS_TO_CSV,
|
94 |
SAVE_LOGS_TO_DYNAMODB,
|
95 |
SESSION_OUTPUT_FOLDER,
|
96 |
+
SHOW_AWS_EXAMPLES,
|
97 |
SHOW_COSTS,
|
98 |
SHOW_EXAMPLES,
|
99 |
SHOW_LANGUAGE_SELECTION,
|
|
|
1009 |
###
|
1010 |
with gr.Tab("Redact PDFs/images"):
|
1011 |
|
|
|
1012 |
# Examples for PDF/image redaction
|
1013 |
if SHOW_EXAMPLES == "True":
|
1014 |
gr.Markdown(
|
1015 |
"### Try an example - Click on an example below and then the 'Extract text and redact document' button:"
|
1016 |
)
|
1017 |
+
|
1018 |
# Check which example files exist and create examples only for available files
|
1019 |
example_files = [
|
1020 |
"example_data/example_of_emails_sent_to_a_professor_before_applying.pdf",
|
1021 |
+
"example_data/example_complaint_letter.jpg",
|
1022 |
"example_data/graduate-job-example-cover-letter.pdf",
|
1023 |
+
"example_data/Partnership-Agreement-Toolkit_0_0.pdf",
|
1024 |
]
|
1025 |
+
|
1026 |
+
available_examples = list()
|
1027 |
+
example_labels = list()
|
1028 |
+
|
1029 |
# Check each example file and add to examples if it exists
|
1030 |
if os.path.exists(example_files[0]):
|
1031 |
+
available_examples.append(
|
1032 |
+
[
|
1033 |
+
[example_files[0]],
|
1034 |
+
"Local model - selectable text",
|
1035 |
+
"Local",
|
1036 |
+
[],
|
1037 |
+
CHOSEN_REDACT_ENTITIES,
|
1038 |
+
CHOSEN_COMPREHEND_ENTITIES,
|
1039 |
+
[example_files[0]],
|
1040 |
+
]
|
1041 |
+
)
|
1042 |
example_labels.append("PDF with selectable text redaction")
|
1043 |
+
|
1044 |
if os.path.exists(example_files[1]):
|
1045 |
+
available_examples.append(
|
1046 |
+
[
|
1047 |
+
[example_files[1]],
|
1048 |
+
"Local OCR model - PDFs without selectable text",
|
1049 |
+
"Local",
|
1050 |
+
[],
|
1051 |
+
CHOSEN_REDACT_ENTITIES,
|
1052 |
+
CHOSEN_COMPREHEND_ENTITIES,
|
1053 |
+
[example_files[1]],
|
1054 |
+
]
|
1055 |
+
)
|
1056 |
example_labels.append("Image redaction with local OCR")
|
1057 |
+
|
1058 |
if os.path.exists(example_files[2]):
|
1059 |
+
available_examples.append(
|
1060 |
+
[
|
1061 |
+
[example_files[2]],
|
1062 |
+
"Local OCR model - PDFs without selectable text",
|
1063 |
+
"Local",
|
1064 |
+
[],
|
1065 |
+
["TITLES", "PERSON", "DATE_TIME"],
|
1066 |
+
CHOSEN_COMPREHEND_ENTITIES,
|
1067 |
+
[example_files[2]],
|
1068 |
+
]
|
1069 |
+
)
|
1070 |
+
example_labels.append(
|
1071 |
+
"PDF redaction with custom entities (Titles, Person, Dates)"
|
1072 |
+
)
|
1073 |
+
|
1074 |
if os.path.exists(example_files[3]):
|
1075 |
+
if SHOW_AWS_EXAMPLES == "True":
|
1076 |
+
available_examples.append(
|
1077 |
+
[
|
1078 |
+
[example_files[3]],
|
1079 |
+
"AWS Textract service - all PDF types",
|
1080 |
+
"AWS Comprehend",
|
1081 |
+
["Extract handwriting", "Extract signatures"],
|
1082 |
+
CHOSEN_REDACT_ENTITIES,
|
1083 |
+
CHOSEN_COMPREHEND_ENTITIES,
|
1084 |
+
[example_files[3]],
|
1085 |
+
]
|
1086 |
+
)
|
1087 |
+
example_labels.append(
|
1088 |
+
"PDF redaction with AWS services and signature detection"
|
1089 |
+
)
|
1090 |
+
|
1091 |
# Only create examples if we have available files
|
1092 |
if available_examples:
|
1093 |
+
|
1094 |
+
def show_info_box_on_click(
|
1095 |
+
in_doc_files,
|
1096 |
+
text_extract_method_radio,
|
1097 |
+
pii_identification_method_drop,
|
1098 |
+
handwrite_signature_checkbox,
|
1099 |
+
in_redact_entities,
|
1100 |
+
in_redact_comprehend_entities,
|
1101 |
+
prepared_pdf_state,
|
1102 |
+
):
|
1103 |
+
gr.Info(
|
1104 |
+
"Example data loaded. Now click on 'Extract text and redact document' below to run the example redaction."
|
1105 |
+
)
|
1106 |
+
|
1107 |
redaction_examples = gr.Examples(
|
1108 |
examples=available_examples,
|
1109 |
inputs=[
|
|
|
1116 |
prepared_pdf_state,
|
1117 |
],
|
1118 |
example_labels=example_labels,
|
1119 |
+
fn=show_info_box_on_click,
|
1120 |
+
run_on_click=True,
|
1121 |
)
|
1122 |
|
1123 |
with gr.Accordion("Redact document", open=True):
|
|
|
1693 |
"Search for duplicate pages/subdocuments in your ocr_output files. By default, this function will search for duplicate text across multiple pages, and then join consecutive matching pages together into matched 'subdocuments'. The results can be reviewed below, false positives removed, and then the verified results applied to a document you have loaded in on the 'Review redactions' tab."
|
1694 |
)
|
1695 |
|
1696 |
+
# Examples for duplicate page detection
|
1697 |
+
# ... existing code ...
|
1698 |
+
|
1699 |
# Examples for duplicate page detection
|
1700 |
if SHOW_EXAMPLES == "True":
|
1701 |
gr.Markdown(
|
1702 |
"### Try an example - Click on an example below and then the 'Identify duplicate pages/subdocuments' button:"
|
1703 |
)
|
1704 |
+
|
1705 |
# Check if duplicate example file exists
|
1706 |
+
duplicate_example_file = (
|
1707 |
+
"example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv"
|
1708 |
+
)
|
1709 |
+
|
1710 |
if os.path.exists(duplicate_example_file):
|
1711 |
+
|
1712 |
+
def show_duplicate_info_box_on_click(
|
1713 |
+
in_duplicate_pages,
|
1714 |
+
duplicate_threshold_input,
|
1715 |
+
min_word_count_input,
|
1716 |
+
combine_page_text_for_duplicates_bool,
|
1717 |
+
):
|
1718 |
+
gr.Info(
|
1719 |
+
"Example data loaded. Now click on 'Identify duplicate pages/subdocuments' below to run the example duplicate detection."
|
1720 |
+
)
|
1721 |
+
|
1722 |
duplicate_examples = gr.Examples(
|
1723 |
examples=[
|
1724 |
[
|
|
|
1744 |
"Find duplicate pages of text in document OCR outputs",
|
1745 |
"Find duplicate text lines in document OCR outputs",
|
1746 |
],
|
1747 |
+
fn=show_duplicate_info_box_on_click,
|
1748 |
+
run_on_click=True,
|
1749 |
)
|
1750 |
|
1751 |
with gr.Accordion("Step 1: Configure and run analysis", open=True):
|
|
|
1868 |
###
|
1869 |
with gr.Tab(label="Word or Excel/csv files"):
|
1870 |
gr.Markdown(
|
1871 |
+
"""Choose a Word or tabular data file (xlsx or csv) to redact. Note that when redacting complex Word files with e.g. images, some content/formatting will be removed, and it may not attempt to redact headers. You may prefer to convert the doc file to PDF in Word, and then run it through the first tab of this app (Print to PDF in print settings). Alternatively, an xlsx file output is provided when redacting docx files directly to allow for copying and pasting outputs back into the original document if preferred."""
|
1872 |
)
|
1873 |
|
1874 |
# Examples for Word/Excel/csv redaction and tabular duplicate detection
|
|
|
1876 |
gr.Markdown(
|
1877 |
"### Try an example - Click on an example below and then the 'Redact text/data files' button for redaction, or the 'Find duplicate cells/rows' button for duplicate detection:"
|
1878 |
)
|
1879 |
+
|
1880 |
# Check which tabular example files exist
|
1881 |
tabular_example_files = [
|
1882 |
"example_data/combined_case_notes.csv",
|
1883 |
"example_data/Bold minimalist professional cover letter.docx",
|
1884 |
+
"example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv",
|
1885 |
]
|
1886 |
+
|
1887 |
+
available_tabular_examples = list()
|
1888 |
+
tabular_example_labels = list()
|
1889 |
+
|
1890 |
# Check each tabular example file and add to examples if it exists
|
1891 |
if os.path.exists(tabular_example_files[0]):
|
1892 |
+
available_tabular_examples.append(
|
1893 |
+
[
|
1894 |
+
[tabular_example_files[0]],
|
1895 |
+
["Case Note", "Client"],
|
1896 |
+
"Local",
|
1897 |
+
"replace with 'REDACTED'",
|
1898 |
+
[tabular_example_files[0]],
|
1899 |
+
["Case Note"],
|
1900 |
+
]
|
1901 |
+
)
|
1902 |
+
tabular_example_labels.append(
|
1903 |
+
"CSV file redaction with specific columns - remove text"
|
1904 |
+
)
|
1905 |
+
|
1906 |
if os.path.exists(tabular_example_files[1]):
|
1907 |
+
available_tabular_examples.append(
|
1908 |
+
[
|
1909 |
+
[tabular_example_files[1]],
|
1910 |
+
[],
|
1911 |
+
"Local",
|
1912 |
+
"replace with 'REDACTED'",
|
1913 |
+
[],
|
1914 |
+
[],
|
1915 |
+
]
|
1916 |
+
)
|
1917 |
+
tabular_example_labels.append(
|
1918 |
+
"Word document redaction - replace with REDACTED"
|
1919 |
+
)
|
1920 |
+
|
1921 |
if os.path.exists(tabular_example_files[2]):
|
1922 |
+
available_tabular_examples.append(
|
1923 |
+
[
|
1924 |
+
[tabular_example_files[2]],
|
1925 |
+
["text"],
|
1926 |
+
"Local",
|
1927 |
+
"replace with 'REDACTED'",
|
1928 |
+
[tabular_example_files[2]],
|
1929 |
+
["text"],
|
1930 |
+
]
|
1931 |
+
)
|
1932 |
+
tabular_example_labels.append(
|
1933 |
+
"Tabular duplicate detection in CSV files"
|
1934 |
+
)
|
1935 |
+
|
1936 |
# Only create examples if we have available files
|
1937 |
if available_tabular_examples:
|
1938 |
+
|
1939 |
+
def show_tabular_info_box_on_click(
|
1940 |
+
in_data_files,
|
1941 |
+
in_colnames,
|
1942 |
+
pii_identification_method_drop_tabular,
|
1943 |
+
anon_strategy,
|
1944 |
+
in_tabular_duplicate_files,
|
1945 |
+
tabular_text_columns,
|
1946 |
+
):
|
1947 |
+
gr.Info(
|
1948 |
+
"Example data loaded. Now click on 'Redact text/data files' or 'Find duplicate cells/rows' below to run the example."
|
1949 |
+
)
|
1950 |
+
|
1951 |
tabular_examples = gr.Examples(
|
1952 |
examples=available_tabular_examples,
|
1953 |
inputs=[
|
|
|
1959 |
tabular_text_columns,
|
1960 |
],
|
1961 |
example_labels=tabular_example_labels,
|
1962 |
+
fn=show_tabular_info_box_on_click,
|
1963 |
+
run_on_click=True,
|
1964 |
)
|
1965 |
|
1966 |
with gr.Accordion("Redact Word or Excel/csv files", open=True):
|
|
|
2387 |
# Recalculate estimated costs based on changes to inputs
|
2388 |
if SHOW_COSTS == "True":
|
2389 |
# Calculate costs
|
2390 |
+
total_pdf_page_count.input(
|
2391 |
calculate_aws_costs,
|
2392 |
inputs=[
|
2393 |
total_pdf_page_count,
|
|
|
2399 |
],
|
2400 |
outputs=[estimated_aws_costs_number],
|
2401 |
)
|
2402 |
+
text_extract_method_radio.input(
|
2403 |
fn=check_for_relevant_ocr_output_with_words,
|
2404 |
inputs=[
|
2405 |
doc_file_name_no_extension_textbox,
|
|
|
2419 |
],
|
2420 |
outputs=[estimated_aws_costs_number],
|
2421 |
)
|
2422 |
+
pii_identification_method_drop.input(
|
2423 |
calculate_aws_costs,
|
2424 |
inputs=[
|
2425 |
total_pdf_page_count,
|
|
|
2431 |
],
|
2432 |
outputs=[estimated_aws_costs_number],
|
2433 |
)
|
2434 |
+
handwrite_signature_checkbox.input(
|
2435 |
calculate_aws_costs,
|
2436 |
inputs=[
|
2437 |
total_pdf_page_count,
|
|
|
2443 |
],
|
2444 |
outputs=[estimated_aws_costs_number],
|
2445 |
)
|
2446 |
+
textract_output_found_checkbox.input(
|
2447 |
calculate_aws_costs,
|
2448 |
inputs=[
|
2449 |
total_pdf_page_count,
|
|
|
2455 |
],
|
2456 |
outputs=[estimated_aws_costs_number],
|
2457 |
)
|
2458 |
+
only_extract_text_radio.input(
|
2459 |
calculate_aws_costs,
|
2460 |
inputs=[
|
2461 |
total_pdf_page_count,
|
|
|
2467 |
],
|
2468 |
outputs=[estimated_aws_costs_number],
|
2469 |
)
|
2470 |
+
textract_output_found_checkbox.input(
|
2471 |
calculate_aws_costs,
|
2472 |
inputs=[
|
2473 |
total_pdf_page_count,
|
|
|
2481 |
)
|
2482 |
|
2483 |
# Calculate time taken
|
2484 |
+
total_pdf_page_count.input(
|
2485 |
calculate_time_taken,
|
2486 |
inputs=[
|
2487 |
total_pdf_page_count,
|
|
|
2493 |
],
|
2494 |
outputs=[estimated_time_taken_number],
|
2495 |
)
|
2496 |
+
text_extract_method_radio.input(
|
2497 |
calculate_time_taken,
|
2498 |
inputs=[
|
2499 |
total_pdf_page_count,
|
|
|
2505 |
],
|
2506 |
outputs=[estimated_time_taken_number],
|
2507 |
)
|
2508 |
+
pii_identification_method_drop.input(
|
2509 |
calculate_time_taken,
|
2510 |
inputs=[
|
2511 |
total_pdf_page_count,
|
|
|
2517 |
],
|
2518 |
outputs=[estimated_time_taken_number],
|
2519 |
)
|
2520 |
+
handwrite_signature_checkbox.input(
|
2521 |
calculate_time_taken,
|
2522 |
inputs=[
|
2523 |
total_pdf_page_count,
|
|
|
2529 |
],
|
2530 |
outputs=[estimated_time_taken_number],
|
2531 |
)
|
2532 |
+
textract_output_found_checkbox.input(
|
2533 |
calculate_time_taken,
|
2534 |
inputs=[
|
2535 |
total_pdf_page_count,
|
|
|
2542 |
],
|
2543 |
outputs=[estimated_time_taken_number],
|
2544 |
)
|
2545 |
+
only_extract_text_radio.input(
|
2546 |
calculate_time_taken,
|
2547 |
inputs=[
|
2548 |
total_pdf_page_count,
|
|
|
2554 |
],
|
2555 |
outputs=[estimated_time_taken_number],
|
2556 |
)
|
2557 |
+
textract_output_found_checkbox.input(
|
2558 |
calculate_time_taken,
|
2559 |
inputs=[
|
2560 |
total_pdf_page_count,
|
|
|
2566 |
],
|
2567 |
outputs=[estimated_time_taken_number],
|
2568 |
)
|
2569 |
+
relevant_ocr_output_with_words_found_checkbox.input(
|
2570 |
calculate_time_taken,
|
2571 |
inputs=[
|
2572 |
total_pdf_page_count,
|
|
|
5264 |
pdf_doc_state,
|
5265 |
images_pdf_state,
|
5266 |
output_folder_textbox,
|
5267 |
+
input_folder_textbox,
|
5268 |
],
|
5269 |
outputs=[input_pdf_for_review],
|
5270 |
scroll_to_output=True,
|
|
|
6498 |
# Run the CLI main function with direct mode arguments
|
6499 |
main(direct_mode_args=direct_mode_args)
|
6500 |
|
|
|
6501 |
# Combine extraction options
|
6502 |
extraction_options = (
|
6503 |
list(direct_mode_args["handwrite_signature_extraction"])
|
pyproject.toml
CHANGED
@@ -36,7 +36,8 @@ dependencies = [
|
|
36 |
"python-dotenv==1.0.1",
|
37 |
"awslambdaric==3.1.1",
|
38 |
"python-docx==1.2.0",
|
39 |
-
"polars==1.33.1"
|
|
|
40 |
#"paddlepaddle==3.2.0", # Optional paddle imports - only if you want to use hybrid OCR mode with tesseract and paddleOCR
|
41 |
#"paddleocr==3.2.0"
|
42 |
]
|
@@ -66,4 +67,27 @@ ignore = [
|
|
66 |
# Configuration for a Black formatter:
|
67 |
[tool.black]
|
68 |
line-length = 88
|
69 |
-
target-version = ['py310']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
"python-dotenv==1.0.1",
|
37 |
"awslambdaric==3.1.1",
|
38 |
"python-docx==1.2.0",
|
39 |
+
"polars==1.33.1",
|
40 |
+
"defusedxml==0.7.1",
|
41 |
#"paddlepaddle==3.2.0", # Optional paddle imports - only if you want to use hybrid OCR mode with tesseract and paddleOCR
|
42 |
#"paddleocr==3.2.0"
|
43 |
]
|
|
|
67 |
# Configuration for a Black formatter:
|
68 |
[tool.black]
|
69 |
line-length = 88
|
70 |
+
target-version = ['py310']
|
71 |
+
|
72 |
+
# Configuration for pytest:
|
73 |
+
[tool.pytest.ini_options]
|
74 |
+
filterwarnings = [
|
75 |
+
"ignore::DeprecationWarning:click.parser",
|
76 |
+
"ignore::DeprecationWarning:weasel.util.config",
|
77 |
+
"ignore::DeprecationWarning:builtin type",
|
78 |
+
"ignore::DeprecationWarning:websockets.legacy",
|
79 |
+
"ignore::DeprecationWarning:websockets.server",
|
80 |
+
"ignore::DeprecationWarning:spacy.cli._util",
|
81 |
+
"ignore::DeprecationWarning:weasel.util.config",
|
82 |
+
"ignore::DeprecationWarning:importlib._bootstrap",
|
83 |
+
]
|
84 |
+
testpaths = ["test"]
|
85 |
+
python_files = ["test_*.py", "*_test.py"]
|
86 |
+
python_classes = ["Test*"]
|
87 |
+
python_functions = ["test_*"]
|
88 |
+
addopts = [
|
89 |
+
"-v",
|
90 |
+
"--tb=short",
|
91 |
+
"--strict-markers",
|
92 |
+
"--disable-warnings",
|
93 |
+
]
|
requirements.txt
CHANGED
@@ -23,6 +23,7 @@ rapidfuzz==3.14.1
|
|
23 |
python-dotenv==1.0.1
|
24 |
awslambdaric==3.1.1
|
25 |
python-docx==1.2.0
|
|
|
26 |
# Optional: uncomment the below to install paddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
|
27 |
# paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
|
28 |
# paddleocr==3.2.0
|
|
|
23 |
python-dotenv==1.0.1
|
24 |
awslambdaric==3.1.1
|
25 |
python-docx==1.2.0
|
26 |
+
defusedxml==0.7.1
|
27 |
# Optional: uncomment the below to install paddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
|
28 |
# paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
|
29 |
# paddleocr==3.2.0
|
tools/config.py
CHANGED
@@ -544,6 +544,7 @@ except Exception as e:
|
|
544 |
COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
|
545 |
|
546 |
SHOW_EXAMPLES = get_or_create_env_var("SHOW_EXAMPLES", "False")
|
|
|
547 |
|
548 |
RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")
|
549 |
|
|
|
544 |
COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
|
545 |
|
546 |
SHOW_EXAMPLES = get_or_create_env_var("SHOW_EXAMPLES", "False")
|
547 |
+
SHOW_AWS_EXAMPLES = get_or_create_env_var("SHOW_AWS_EXAMPLES", "False")
|
548 |
|
549 |
RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")
|
550 |
|
tools/data_anonymise.py
CHANGED
@@ -588,6 +588,9 @@ def anonymise_files_with_open_text(
|
|
588 |
# Set to a very high number so as not to mess with subsequent file processing by the user
|
589 |
# latest_file_completed = 99
|
590 |
final_out_message = "\n".join(out_message)
|
|
|
|
|
|
|
591 |
return (
|
592 |
final_out_message,
|
593 |
out_file_paths,
|
|
|
588 |
# Set to a very high number so as not to mess with subsequent file processing by the user
|
589 |
# latest_file_completed = 99
|
590 |
final_out_message = "\n".join(out_message)
|
591 |
+
|
592 |
+
gr.Info(final_out_message)
|
593 |
+
|
594 |
return (
|
595 |
final_out_message,
|
596 |
out_file_paths,
|
tools/file_redaction.py
CHANGED
@@ -470,6 +470,8 @@ def choose_and_run_redactor(
|
|
470 |
)
|
471 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
472 |
|
|
|
|
|
473 |
page_break_return = True
|
474 |
|
475 |
return (
|
|
|
470 |
)
|
471 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
472 |
|
473 |
+
gr.Info(combined_out_message)
|
474 |
+
|
475 |
page_break_return = True
|
476 |
|
477 |
return (
|
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -277,7 +277,7 @@ def download_tesseract_lang_pack(
|
|
277 |
|
278 |
# Download the file
|
279 |
try:
|
280 |
-
response = requests.get(url, stream=True)
|
281 |
response.raise_for_status() # Raise an exception for bad status codes
|
282 |
|
283 |
with open(file_path, "wb") as f:
|
|
|
277 |
|
278 |
# Download the file
|
279 |
try:
|
280 |
+
response = requests.get(url, stream=True, timeout=60)
|
281 |
response.raise_for_status() # Raise an exception for bad status codes
|
282 |
|
283 |
with open(file_path, "wb") as f:
|
tools/redaction_review.py
CHANGED
@@ -4,8 +4,14 @@ import string
|
|
4 |
import uuid
|
5 |
from datetime import datetime, timedelta, timezone
|
6 |
from typing import Dict, List, Set, Tuple
|
7 |
-
from xml.
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
import gradio as gr
|
11 |
import numpy as np
|
@@ -617,6 +623,10 @@ def update_annotator_page_from_review_df(
|
|
617 |
print(
|
618 |
f"Error during image path replacement for page {gradio_annotator_current_page_number}: {e}"
|
619 |
)
|
|
|
|
|
|
|
|
|
620 |
|
621 |
# Save back page_sizes_df to page_sizes list format
|
622 |
if not page_sizes_df.empty:
|
@@ -2652,7 +2662,7 @@ def create_xfdf(
|
|
2652 |
data_element.text = data_content_string
|
2653 |
|
2654 |
rough_string = tostring(xfdf_root, encoding="unicode", method="xml")
|
2655 |
-
reparsed =
|
2656 |
return reparsed.toxml() # .toprettyxml(indent=" ")
|
2657 |
|
2658 |
|
@@ -2793,7 +2803,9 @@ def parse_xfdf(xfdf_path: str):
|
|
2793 |
Returns:
|
2794 |
- List of dictionaries containing redaction information
|
2795 |
"""
|
2796 |
-
|
|
|
|
|
2797 |
root = tree.getroot()
|
2798 |
|
2799 |
# Define the namespace
|
@@ -2804,6 +2816,25 @@ def parse_xfdf(xfdf_path: str):
|
|
2804 |
# Find all redact elements using the namespace
|
2805 |
for redact in root.findall(".//xfdf:redact", namespaces=namespace):
|
2806 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2807 |
redaction_info = {
|
2808 |
"image": "", # Image will be filled in later
|
2809 |
"page": int(redact.get("page")) + 1, # Convert to 1-based index
|
@@ -2812,7 +2843,7 @@ def parse_xfdf(xfdf_path: str):
|
|
2812 |
"xmax": float(redact.get("rect").split(",")[2]),
|
2813 |
"ymax": float(redact.get("rect").split(",")[3]),
|
2814 |
"label": redact.get("title"),
|
2815 |
-
"text":
|
2816 |
"color": redact.get(
|
2817 |
"border-color", "(0, 0, 0)"
|
2818 |
), # Default to black if not specified
|
@@ -2824,9 +2855,10 @@ def parse_xfdf(xfdf_path: str):
|
|
2824 |
|
2825 |
def convert_xfdf_to_dataframe(
|
2826 |
file_paths_list: List[str],
|
2827 |
-
pymupdf_doc,
|
2828 |
image_paths: List[str],
|
2829 |
output_folder: str = OUTPUT_FOLDER,
|
|
|
2830 |
):
|
2831 |
"""
|
2832 |
Convert redaction annotations from XFDF and associated images into a DataFrame.
|
@@ -2835,12 +2867,16 @@ def convert_xfdf_to_dataframe(
|
|
2835 |
- xfdf_path: Path to the XFDF file
|
2836 |
- pdf_doc: PyMuPDF document object
|
2837 |
- image_paths: List of PIL Image objects corresponding to PDF pages
|
|
|
|
|
2838 |
|
2839 |
Returns:
|
2840 |
- DataFrame containing redaction information
|
2841 |
"""
|
2842 |
output_paths = list()
|
2843 |
df = pd.DataFrame()
|
|
|
|
|
2844 |
|
2845 |
# Sort the file paths so that the pdfs come first
|
2846 |
file_paths_list = sorted(
|
@@ -2863,6 +2899,7 @@ def convert_xfdf_to_dataframe(
|
|
2863 |
|
2864 |
if file_path_end == "pdf":
|
2865 |
pdf_name = os.path.basename(file_path)
|
|
|
2866 |
|
2867 |
# Add pdf to outputs
|
2868 |
output_paths.append(file_path)
|
@@ -2896,7 +2933,18 @@ def convert_xfdf_to_dataframe(
|
|
2896 |
image_path = image_paths[page_python_format]
|
2897 |
|
2898 |
if isinstance(image_path, str):
|
2899 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2900 |
|
2901 |
image_page_width, image_page_height = image.size
|
2902 |
|
@@ -2927,4 +2975,8 @@ def convert_xfdf_to_dataframe(
|
|
2927 |
|
2928 |
output_paths.append(out_file_path)
|
2929 |
|
|
|
|
|
|
|
|
|
2930 |
return output_paths
|
|
|
4 |
import uuid
|
5 |
from datetime import datetime, timedelta, timezone
|
6 |
from typing import Dict, List, Set, Tuple
|
7 |
+
from xml.etree.ElementTree import Element, SubElement, tostring
|
8 |
+
|
9 |
+
import defusedxml
|
10 |
+
import defusedxml.ElementTree as defused_etree
|
11 |
+
import defusedxml.minidom as defused_minidom
|
12 |
+
|
13 |
+
# Defuse the standard library XML modules for security
|
14 |
+
defusedxml.defuse_stdlib()
|
15 |
|
16 |
import gradio as gr
|
17 |
import numpy as np
|
|
|
623 |
print(
|
624 |
f"Error during image path replacement for page {gradio_annotator_current_page_number}: {e}"
|
625 |
)
|
626 |
+
else:
|
627 |
+
print(
|
628 |
+
f"Warning: Page index {page_num_reported_zero_indexed} out of bounds for all_image_annotations list."
|
629 |
+
)
|
630 |
|
631 |
# Save back page_sizes_df to page_sizes list format
|
632 |
if not page_sizes_df.empty:
|
|
|
2662 |
data_element.text = data_content_string
|
2663 |
|
2664 |
rough_string = tostring(xfdf_root, encoding="unicode", method="xml")
|
2665 |
+
reparsed = defused_minidom.parseString(rough_string)
|
2666 |
return reparsed.toxml() # .toprettyxml(indent=" ")
|
2667 |
|
2668 |
|
|
|
2803 |
Returns:
|
2804 |
- List of dictionaries containing redaction information
|
2805 |
"""
|
2806 |
+
# Assuming xfdf_path is a file path. If you are passing the XML string,
|
2807 |
+
# you would use defused_etree.fromstring(xfdf_string) instead of .parse()
|
2808 |
+
tree = defused_etree.parse(xfdf_path)
|
2809 |
root = tree.getroot()
|
2810 |
|
2811 |
# Define the namespace
|
|
|
2816 |
# Find all redact elements using the namespace
|
2817 |
for redact in root.findall(".//xfdf:redact", namespaces=namespace):
|
2818 |
|
2819 |
+
# Extract text from contents-richtext if it exists
|
2820 |
+
text_content = ""
|
2821 |
+
|
2822 |
+
# The lookup must be namespace-qualified: an unqualified tag name never matches a namespaced element.
|
2823 |
+
# Use the namespace to find the contents-richtext element
|
2824 |
+
contents_richtext = redact.find(
|
2825 |
+
".//xfdf:contents-richtext", namespaces=namespace
|
2826 |
+
)
|
2827 |
+
|
2828 |
+
if contents_richtext is not None:
|
2829 |
+
# Get all text content from the HTML structure
|
2830 |
+
# The children of contents-richtext (body, p, span) have a different namespace
|
2831 |
+
# but itertext() cleverly handles that for us.
|
2832 |
+
text_content = "".join(contents_richtext.itertext()).strip()
|
2833 |
+
|
2834 |
+
# Fallback to contents attribute if no richtext content
|
2835 |
+
if not text_content:
|
2836 |
+
text_content = redact.get("contents", "")
|
2837 |
+
|
2838 |
redaction_info = {
|
2839 |
"image": "", # Image will be filled in later
|
2840 |
"page": int(redact.get("page")) + 1, # Convert to 1-based index
|
|
|
2843 |
"xmax": float(redact.get("rect").split(",")[2]),
|
2844 |
"ymax": float(redact.get("rect").split(",")[3]),
|
2845 |
"label": redact.get("title"),
|
2846 |
+
"text": text_content, # Use the extracted text content
|
2847 |
"color": redact.get(
|
2848 |
"border-color", "(0, 0, 0)"
|
2849 |
), # Default to black if not specified
|
|
|
2855 |
|
2856 |
def convert_xfdf_to_dataframe(
|
2857 |
file_paths_list: List[str],
|
2858 |
+
pymupdf_doc: Document,
|
2859 |
image_paths: List[str],
|
2860 |
output_folder: str = OUTPUT_FOLDER,
|
2861 |
+
input_folder: str = INPUT_FOLDER,
|
2862 |
):
|
2863 |
"""
|
2864 |
Convert redaction annotations from XFDF and associated images into a DataFrame.
|
|
|
2867 |
- xfdf_path: Path to the XFDF file
|
2868 |
- pdf_doc: PyMuPDF document object
|
2869 |
- image_paths: List of PIL Image objects corresponding to PDF pages
|
2870 |
+
- output_folder: Output folder for file save
|
2871 |
+
- input_folder: Input folder for image creation
|
2872 |
|
2873 |
Returns:
|
2874 |
- DataFrame containing redaction information
|
2875 |
"""
|
2876 |
output_paths = list()
|
2877 |
df = pd.DataFrame()
|
2878 |
+
pdf_name = ""
|
2879 |
+
pdf_path = ""
|
2880 |
|
2881 |
# Sort the file paths so that the pdfs come first
|
2882 |
file_paths_list = sorted(
|
|
|
2899 |
|
2900 |
if file_path_end == "pdf":
|
2901 |
pdf_name = os.path.basename(file_path)
|
2902 |
+
pdf_path = file_path
|
2903 |
|
2904 |
# Add pdf to outputs
|
2905 |
output_paths.append(file_path)
|
|
|
2933 |
image_path = image_paths[page_python_format]
|
2934 |
|
2935 |
if isinstance(image_path, str):
|
2936 |
+
try:
|
2937 |
+
image = Image.open(image_path)
|
2938 |
+
except Exception:
|
2939 |
+
# The stored page image could not be opened, so fall back to generating it from the source PDF page.
|
2940 |
+
|
2941 |
+
page_num, out_path, width, height = (
|
2942 |
+
process_single_page_for_image_conversion(
|
2943 |
+
pdf_path, page_python_format, input_folder=input_folder
|
2944 |
+
)
|
2945 |
+
)
|
2946 |
+
|
2947 |
+
image = Image.open(out_path)
|
2948 |
|
2949 |
image_page_width, image_page_height = image.size
|
2950 |
|
|
|
2975 |
|
2976 |
output_paths.append(out_file_path)
|
2977 |
|
2978 |
+
gr.Info(
|
2979 |
+
f"Review file saved to {out_file_path}. Now click on '1. Upload original pdf' to view the pdf with the annotations."
|
2980 |
+
)
|
2981 |
+
|
2982 |
return output_paths
|
tools/secure_regex_utils.py
CHANGED
@@ -86,7 +86,7 @@ def safe_extract_page_number_from_filename(filename: str) -> Optional[int]:
|
|
86 |
|
87 |
def safe_extract_page_number_from_path(path: str) -> Optional[int]:
|
88 |
"""
|
89 |
-
Safely extract page number from path containing _(
|
90 |
|
91 |
Args:
|
92 |
path: The path to extract page number from
|
|
|
86 |
|
87 |
def safe_extract_page_number_from_path(path: str) -> Optional[int]:
|
88 |
"""
|
89 |
+
Safely extract page number from path containing _(\\d+).png pattern.
|
90 |
|
91 |
Args:
|
92 |
path: The path to extract page number from
|