seanpedrickcase committed on
Commit
7907ad4
·
1 Parent(s): 82b9d9d

Laid groundwork for passing in AWS API keys. Duplicate pages option should now work for pages with no text.

Browse files
app.py CHANGED
@@ -341,6 +341,10 @@ with app:
341
  #with gr.Row():
342
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
343
 
 
 
 
 
344
  with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
345
  anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
346
 
@@ -362,12 +366,12 @@ with app:
362
 
363
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
364
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
365
- then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
366
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
367
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
368
 
369
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
370
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
371
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
372
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
373
 
 
341
  #with gr.Row():
342
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
343
 
344
+ with gr.Row():
345
+ aws_access_key_textbox = gr.Textbox(value='', label="AWS access key for account with permissions for AWS Textract and Comprehend", visible=False)
346
+ aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key for account with permissions for AWS Textract and Comprehend", visible=False)
347
+
348
  with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
349
  anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
350
 
 
366
 
367
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
368
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
369
+ then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox],
370
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
371
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
372
 
373
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
374
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox],
375
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
376
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
377
 
tools/aws_functions.py CHANGED
@@ -16,6 +16,14 @@ print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
16
  AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
17
  print(f'The value of AWS_REGION is {AWS_REGION}')
18
 
 
 
 
 
 
 
 
 
19
 
20
 
21
  def get_assumed_role_info():
@@ -36,7 +44,7 @@ if RUN_AWS_FUNCTIONS == "1":
36
  bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
37
  session = boto3.Session()
38
 
39
- #print("session:", session)
40
 
41
  except Exception as e:
42
  print("Could not start boto3 session:", e)
 
16
  AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
17
  print(f'The value of AWS_REGION is {AWS_REGION}')
18
 
19
+ AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
20
+ if AWS_ACCESS_KEY:
21
+ print(f'AWS_ACCESS_KEY found in environment variables')
22
+
23
+ AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
24
+ if AWS_SECRET_KEY:
25
+ print(f'AWS_SECRET_KEY found in environment variables')
26
+
27
 
28
 
29
  def get_assumed_role_info():
 
44
  bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
45
  session = boto3.Session()
46
 
47
+ #print("session:", session)
48
 
49
  except Exception as e:
50
  print("Could not start boto3 session:", e)
tools/aws_textract.py CHANGED
@@ -8,6 +8,7 @@ import time
8
  # Example: converting this single page to an image
9
  #from pdf2image import convert_from_bytes
10
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
 
11
 
12
  def extract_textract_metadata(response):
13
  """Extracts metadata from an AWS Textract response."""
@@ -30,8 +31,13 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_sig
30
  Analyse page with AWS Textract
31
  '''
32
  if client == "":
33
- try:
34
- client = boto3.client('textract')
 
 
 
 
 
35
  except:
36
  print("Cannot connect to AWS Textract")
37
  return [], "" # Return an empty list and an empty string
 
8
  # Example: converting this single page to an image
9
  #from pdf2image import convert_from_bytes
10
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
11
+ from tools.aws_functions import AWS_ACCESS_KEY, AWS_SECRET_KEY
12
 
13
  def extract_textract_metadata(response):
14
  """Extracts metadata from an AWS Textract response."""
 
31
  Analyse page with AWS Textract
32
  '''
33
  if client == "":
34
+ try:
35
+ if AWS_ACCESS_KEY and AWS_SECRET_KEY:
36
+ client = boto3.client('textract',
37
+ aws_access_key_id=AWS_ACCESS_KEY,
38
+ aws_secret_access_key=AWS_SECRET_KEY)
39
+ else:
40
+ client = boto3.client('textract')
41
  except:
42
  print("Cannot connect to AWS Textract")
43
  return [], "" # Return an empty list and an empty string
tools/file_redaction.py CHANGED
@@ -24,7 +24,7 @@ from gradio import Progress
24
  from collections import defaultdict # For efficient grouping
25
 
26
  from presidio_analyzer import RecognizerResult
27
- from tools.aws_functions import RUN_AWS_FUNCTIONS
28
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
29
  from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
30
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
@@ -40,6 +40,7 @@ print(f'The value of page_break_value is {page_break_value}')
40
  max_time_value = get_or_create_env_var('max_time_value', '999999')
41
  print(f'The value of max_time_value is {max_time_value}')
42
 
 
43
  def bounding_boxes_overlap(box1, box2):
44
  """Check if two bounding boxes overlap."""
45
  return (box1[0] < box2[2] and box2[0] < box1[2] and
@@ -96,6 +97,8 @@ def choose_and_run_redactor(file_paths:List[str],
96
  comprehend_query_number:int=0,
97
  max_fuzzy_spelling_mistakes_num:int=1,
98
  match_fuzzy_whole_phrase_bool:bool=True,
 
 
99
  output_folder:str=output_folder,
100
  progress=gr.Progress(track_tqdm=True)):
101
  '''
@@ -129,8 +132,10 @@ def choose_and_run_redactor(file_paths:List[str],
129
  - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
130
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
131
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
132
- - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
133
- - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
 
 
134
  - output_folder (str, optional): Output folder for results.
135
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
136
 
@@ -242,6 +247,14 @@ def choose_and_run_redactor(file_paths:List[str],
242
  print("Trying to connect to AWS Comprehend service")
243
  if RUN_AWS_FUNCTIONS == "1":
244
  comprehend_client = boto3.client('comprehend')
 
 
 
 
 
 
 
 
245
  else:
246
  comprehend_client = ""
247
  out_message = "Cannot connect to AWS Comprehend service. Please choose another PII identification method."
@@ -251,9 +264,17 @@ def choose_and_run_redactor(file_paths:List[str],
251
  comprehend_client = ""
252
 
253
  if in_redact_method == textract_option:
254
- print("Trying to connect to AWS Comprehend service")
255
  if RUN_AWS_FUNCTIONS == "1":
256
  textract_client = boto3.client('textract')
 
 
 
 
 
 
 
 
257
  else:
258
  textract_client = ""
259
  out_message = "Cannot connect to AWS Textract. Please choose another text extraction method."
 
24
  from collections import defaultdict # For efficient grouping
25
 
26
  from presidio_analyzer import RecognizerResult
27
+ from tools.aws_functions import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY
28
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
29
  from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
30
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
 
40
  max_time_value = get_or_create_env_var('max_time_value', '999999')
41
  print(f'The value of max_time_value is {max_time_value}')
42
 
43
+
44
  def bounding_boxes_overlap(box1, box2):
45
  """Check if two bounding boxes overlap."""
46
  return (box1[0] < box2[2] and box2[0] < box1[2] and
 
97
  comprehend_query_number:int=0,
98
  max_fuzzy_spelling_mistakes_num:int=1,
99
  match_fuzzy_whole_phrase_bool:bool=True,
100
+ aws_access_key_textbox:str='',
101
+ aws_secret_key_textbox:str='',
102
  output_folder:str=output_folder,
103
  progress=gr.Progress(track_tqdm=True)):
104
  '''
 
132
  - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
133
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
134
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
135
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
136
+ - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
137
+ - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
138
+ - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
139
  - output_folder (str, optional): Output folder for results.
140
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
141
 
 
247
  print("Trying to connect to AWS Comprehend service")
248
  if RUN_AWS_FUNCTIONS == "1":
249
  comprehend_client = boto3.client('comprehend')
250
+ elif aws_access_key_textbox and aws_secret_key_textbox:
251
+ comprehend_client = boto3.client('comprehend',
252
+ aws_access_key_id=aws_access_key_textbox,
253
+ aws_secret_access_key=aws_secret_key_textbox)
254
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
255
+ comprehend_client = boto3.client('comprehend',
256
+ aws_access_key_id=AWS_ACCESS_KEY,
257
+ aws_secret_access_key=AWS_SECRET_KEY)
258
  else:
259
  comprehend_client = ""
260
  out_message = "Cannot connect to AWS Comprehend service. Please choose another PII identification method."
 
264
  comprehend_client = ""
265
 
266
  if in_redact_method == textract_option:
267
+ print("Trying to connect to AWS Textract service")
268
  if RUN_AWS_FUNCTIONS == "1":
269
  textract_client = boto3.client('textract')
270
+ elif aws_access_key_textbox and aws_secret_key_textbox:
271
+ comprehend_client = boto3.client('textract',
272
+ aws_access_key_id=aws_access_key_textbox,
273
+ aws_secret_access_key=aws_secret_key_textbox)
274
+ elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
275
+ comprehend_client = boto3.client('textract',
276
+ aws_access_key_id=AWS_ACCESS_KEY,
277
+ aws_secret_access_key=AWS_SECRET_KEY)
278
  else:
279
  textract_client = ""
280
  out_message = "Cannot connect to AWS Textract. Please choose another text extraction method."
tools/find_duplicate_pages.py CHANGED
@@ -65,6 +65,8 @@ def combine_ocr_output_text(input_files):
65
  if 'page' not in df.columns or 'text' not in df.columns:
66
  print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
67
  continue
 
 
68
 
69
  # Group by page and concatenate text
70
  grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
 
65
  if 'page' not in df.columns or 'text' not in df.columns:
66
  print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
67
  continue
68
+
69
+ df['text'] = df['text'].fillna('').astype(str)
70
 
71
  # Group by page and concatenate text
72
  grouped = df.groupby('page')['text'].apply(' '.join).reset_index()