Commit 7907ad4
1 Parent(s): 82b9d9d

Laid groundwork for passing in AWS API keys. Duplicate pages option should now work for pages with no text.

Files changed:
- app.py +6 -2
- tools/aws_functions.py +9 -1
- tools/aws_textract.py +8 -2
- tools/file_redaction.py +25 -4
- tools/find_duplicate_pages.py +2 -0
app.py CHANGED

@@ -341,6 +341,10 @@ with app:
         #with gr.Row():
         in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
 
+        with gr.Row():
+            aws_access_key_textbox = gr.Textbox(value='', label="AWS access key for account with permissions for AWS Textract and Comprehend", visible=False)
+            aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key for account with permissions for AWS Textract and Comprehend", visible=False)
+
         with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
             anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
 
@@ -362,12 +366,12 @@ with app:
 
     document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
         then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
-        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
+        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox],
         outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
         then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
     # If the app has completed a batch of pages, it will run this until the end of all pages in the document
-    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
+    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox],
         outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
         then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
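The app.py change follows the usual Gradio pattern: add hidden Textbox components for the credentials and append them to the inputs list of the existing event chain, so the handler receives their values alongside everything else. A minimal, self-contained sketch of that wiring pattern (the component names and handler below are illustrative, not the app's own):

# Minimal sketch of passing hidden credential textboxes into a Gradio event
# (illustrative names only; not the redaction app's actual components).
import gradio as gr

def run_redaction(doc_text, aws_access_key, aws_secret_key):
    # Empty strings mean "fall back to environment/default AWS credentials".
    using_keys = bool(aws_access_key and aws_secret_key)
    return f"Redacting {len(doc_text)} characters (explicit AWS keys supplied: {using_keys})"

with gr.Blocks() as demo:
    doc_text = gr.Textbox(label="Document text")
    # Hidden textboxes: present in the layout so they can be passed as inputs,
    # but not shown unless visible=True is set later.
    aws_access_key_textbox = gr.Textbox(value='', label="AWS access key", visible=False)
    aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key", visible=False)
    output_summary = gr.Textbox(label="Summary")

    redact_btn = gr.Button("Redact")
    redact_btn.click(fn=run_redaction,
                     inputs=[doc_text, aws_access_key_textbox, aws_secret_key_textbox],
                     outputs=[output_summary])

if __name__ == "__main__":
    demo.launch()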
tools/aws_functions.py CHANGED

@@ -16,6 +16,14 @@ print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
 AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
 print(f'The value of AWS_REGION is {AWS_REGION}')
 
+AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
+if AWS_ACCESS_KEY:
+    print(f'AWS_ACCESS_KEY found in environment variables')
+
+AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
+if AWS_SECRET_KEY:
+    print(f'AWS_SECRET_KEY found in environment variables')
+
 
 
 def get_assumed_role_info():

@@ -36,7 +44,7 @@ if RUN_AWS_FUNCTIONS == "1":
         bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
         session = boto3.Session()
 
-        #print("session:", session)
+        #print("session:", session)
 
     except Exception as e:
         print("Could not start boto3 session:", e)
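The new AWS_ACCESS_KEY / AWS_SECRET_KEY environment variables are only read and logged here; nothing in this commit validates them. A hedged sketch of how they could be sanity-checked at startup with STS, assuming boto3 is available (this helper is hypothetical and not part of the commit):

# Hypothetical startup check for the env-var credentials added in this commit.
import boto3
from botocore.exceptions import ClientError, NoCredentialsError

def check_aws_keys(access_key: str, secret_key: str, region: str = "eu-west-2") -> bool:
    """Return True if the supplied key pair is accepted by AWS STS."""
    if not (access_key and secret_key):
        return False
    try:
        sts = boto3.client("sts",
                           aws_access_key_id=access_key,
                           aws_secret_access_key=secret_key,
                           region_name=region)
        identity = sts.get_caller_identity()
        print("AWS keys belong to account:", identity["Account"])
        return True
    except (ClientError, NoCredentialsError) as e:
        print("AWS key check failed:", e)
        return False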
tools/aws_textract.py CHANGED

@@ -8,6 +8,7 @@ import time
 # Example: converting this single page to an image
 #from pdf2image import convert_from_bytes
 from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
+from tools.aws_functions import AWS_ACCESS_KEY, AWS_SECRET_KEY
 
 def extract_textract_metadata(response):
     """Extracts metadata from an AWS Textract response."""

@@ -30,8 +31,13 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_sig
     Analyse page with AWS Textract
     '''
     if client == "":
-        try:
-            client = boto3.client('textract')
+        try:
+            if AWS_ACCESS_KEY and AWS_SECRET_KEY:
+                client = boto3.client('textract',
+                    aws_access_key_id=AWS_ACCESS_KEY,
+                    aws_secret_access_key=AWS_SECRET_KEY)
+            else:
+                client = boto3.client('textract')
         except:
             print("Cannot connect to AWS Textract")
             return [], "" # Return an empty list and an empty string
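For context, the fallback above produces a standard boto3 Textract client, whether built from the explicit keys or from the default credential chain. Below is a generic illustration of using such a client on a single page's bytes via DetectDocumentText; the function name is made up and this is not claimed to be the exact request analyse_page_with_textract issues:

# Illustrative use of a Textract client created with the fallback above.
import boto3

def detect_lines(page_bytes: bytes, access_key: str = "", secret_key: str = ""):
    """Return the recognised text lines for one page of a document."""
    if access_key and secret_key:
        client = boto3.client("textract",
                              aws_access_key_id=access_key,
                              aws_secret_access_key=secret_key)
    else:
        client = boto3.client("textract")  # default credential chain
    response = client.detect_document_text(Document={"Bytes": page_bytes})
    # LINE blocks carry the recognised text; geometry is also available per block.
    return [block["Text"] for block in response["Blocks"] if block["BlockType"] == "LINE"]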
tools/file_redaction.py CHANGED

@@ -24,7 +24,7 @@ from gradio import Progress
 from collections import defaultdict # For efficient grouping
 
 from presidio_analyzer import RecognizerResult
-from tools.aws_functions import RUN_AWS_FUNCTIONS
+from tools.aws_functions import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY
 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
 from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer

@@ -40,6 +40,7 @@ print(f'The value of page_break_value is {page_break_value}')
 max_time_value = get_or_create_env_var('max_time_value', '999999')
 print(f'The value of max_time_value is {max_time_value}')
 
+
 def bounding_boxes_overlap(box1, box2):
     """Check if two bounding boxes overlap."""
     return (box1[0] < box2[2] and box2[0] < box1[2] and

@@ -96,6 +97,8 @@ def choose_and_run_redactor(file_paths:List[str],
     comprehend_query_number:int=0,
     max_fuzzy_spelling_mistakes_num:int=1,
     match_fuzzy_whole_phrase_bool:bool=True,
+    aws_access_key_textbox:str='',
+    aws_secret_key_textbox:str='',
     output_folder:str=output_folder,
     progress=gr.Progress(track_tqdm=True)):
     '''

@@ -129,8 +132,10 @@ def choose_and_run_redactor(file_paths:List[str],
     - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
     - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
     - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
-    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
-    - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
+    - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
+    - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
     - output_folder (str, optional): Output folder for results.
     - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
 

@@ -242,6 +247,14 @@ def choose_and_run_redactor(file_paths:List[str],
         print("Trying to connect to AWS Comprehend service")
         if RUN_AWS_FUNCTIONS == "1":
             comprehend_client = boto3.client('comprehend')
+        elif aws_access_key_textbox and aws_secret_key_textbox:
+            comprehend_client = boto3.client('comprehend',
+                aws_access_key_id=aws_access_key_textbox,
+                aws_secret_access_key=aws_secret_key_textbox)
+        elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+            comprehend_client = boto3.client('comprehend',
+                aws_access_key_id=AWS_ACCESS_KEY,
+                aws_secret_access_key=AWS_SECRET_KEY)
         else:
             comprehend_client = ""
             out_message = "Cannot connect to AWS Comprehend service. Please choose another PII identification method."

@@ -251,9 +264,17 @@ def choose_and_run_redactor(file_paths:List[str],
         comprehend_client = ""
 
     if in_redact_method == textract_option:
-        print("Trying to connect to AWS Textract service")
+        print("Trying to connect to AWS Textract service")
         if RUN_AWS_FUNCTIONS == "1":
             textract_client = boto3.client('textract')
+        elif aws_access_key_textbox and aws_secret_key_textbox:
+            comprehend_client = boto3.client('textract',
+                aws_access_key_id=aws_access_key_textbox,
+                aws_secret_access_key=aws_secret_key_textbox)
+        elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+            comprehend_client = boto3.client('textract',
+                aws_access_key_id=AWS_ACCESS_KEY,
+                aws_secret_access_key=AWS_SECRET_KEY)
         else:
             textract_client = ""
             out_message = "Cannot connect to AWS Textract. Please choose another text extraction method."
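The same three-way precedence (keys typed into the UI, then keys from the environment, then boto3's default credential chain) is now repeated inline for both the Comprehend and Textract branches of choose_and_run_redactor. A hedged sketch of how that precedence could be factored into a single helper; this helper is hypothetical, the commit keeps the branches inline:

# Hypothetical helper consolidating the credential precedence shown above.
import boto3

def create_aws_client(service_name: str,
                      ui_access_key: str = "",
                      ui_secret_key: str = "",
                      env_access_key: str = "",
                      env_secret_key: str = ""):
    """Create a boto3 client, preferring UI-supplied keys, then env-var keys,
    then boto3's default credential chain (env vars, config files, IAM role)."""
    if ui_access_key and ui_secret_key:
        return boto3.client(service_name,
                            aws_access_key_id=ui_access_key,
                            aws_secret_access_key=ui_secret_key)
    if env_access_key and env_secret_key:
        return boto3.client(service_name,
                            aws_access_key_id=env_access_key,
                            aws_secret_access_key=env_secret_key)
    return boto3.client(service_name)

# Hypothetical call sites:
# comprehend_client = create_aws_client("comprehend", aws_access_key_textbox, aws_secret_key_textbox, AWS_ACCESS_KEY, AWS_SECRET_KEY)
# textract_client = create_aws_client("textract", aws_access_key_textbox, aws_secret_key_textbox, AWS_ACCESS_KEY, AWS_SECRET_KEY)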
tools/find_duplicate_pages.py CHANGED

@@ -65,6 +65,8 @@ def combine_ocr_output_text(input_files):
         if 'page' not in df.columns or 'text' not in df.columns:
             print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
             continue
+
+        df['text'] = df['text'].fillna('').astype(str)
 
         # Group by page and concatenate text
         grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
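The one-line fillna is what makes the duplicate-pages option work for pages with no text: a page that produced no OCR output comes through as NaN in the 'text' column, and ' '.join raises a TypeError on non-string values during the groupby. A small self-contained illustration, using a made-up DataFrame rather than real OCR output:

# Illustration of the failure mode fixed above, with a made-up OCR DataFrame.
import pandas as pd
import numpy as np

df = pd.DataFrame({
    "page": [1, 1, 2],
    "text": ["hello", "world", np.nan],  # page 2 produced no OCR text
})

# Without the fix, the join fails on the NaN:
# df.groupby("page")["text"].apply(" ".join)
# -> TypeError: sequence item 0: expected str instance, float found

# With the fix from this commit, empty pages become empty strings and the join succeeds:
df["text"] = df["text"].fillna("").astype(str)
grouped = df.groupby("page")["text"].apply(" ".join).reset_index()
print(grouped)
# page 1 -> "hello world", page 2 -> ""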