Commit 7907ad4
1 Parent(s): 82b9d9d

Laid groundwork for passing in AWS API keys. Duplicate pages option should now work for pages with no text.

Files changed:
- app.py +6 -2
- tools/aws_functions.py +9 -1
- tools/aws_textract.py +8 -2
- tools/file_redaction.py +25 -4
- tools/find_duplicate_pages.py +2 -0
app.py CHANGED

@@ -341,6 +341,10 @@ with app:
         #with gr.Row():
         in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
 
+        with gr.Row():
+            aws_access_key_textbox = gr.Textbox(value='', label="AWS access key for account with permissions for AWS Textract and Comprehend", visible=False)
+            aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key for account with permissions for AWS Textract and Comprehend", visible=False)
+
         with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
             anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
 
@@ -362,12 +366,12 @@ with app:
 
     document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
         then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
-        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
+        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox],
         outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
         then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
     # If the app has completed a batch of pages, it will run this until the end of all pages in the document
-    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
+    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox],
         outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
         then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
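The app.py change follows the usual Gradio pattern: add hidden Textbox components for the credentials and append them to the inputs list of the existing event chain, so the handler receives their values alongside everything else. A minimal, self-contained sketch of that wiring pattern (the component names and handler below are illustrative, not the app's own):

# Minimal sketch of passing hidden credential textboxes into a Gradio event
# (illustrative names only; not the redaction app's actual components).
import gradio as gr

def run_redaction(doc_text, aws_access_key, aws_secret_key):
    # Empty strings mean "fall back to environment/default AWS credentials".
    using_keys = bool(aws_access_key and aws_secret_key)
    return f"Redacting {len(doc_text)} characters (explicit AWS keys supplied: {using_keys})"

with gr.Blocks() as demo:
    doc_text = gr.Textbox(label="Document text")
    # Hidden textboxes: present in the layout so they can be passed as inputs,
    # but not shown unless visible=True is set later.
    aws_access_key_textbox = gr.Textbox(value='', label="AWS access key", visible=False)
    aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key", visible=False)
    output_summary = gr.Textbox(label="Summary")

    redact_btn = gr.Button("Redact")
    redact_btn.click(fn=run_redaction,
                     inputs=[doc_text, aws_access_key_textbox, aws_secret_key_textbox],
                     outputs=[output_summary])

if __name__ == "__main__":
    demo.launch()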
tools/aws_functions.py CHANGED

@@ -16,6 +16,14 @@ print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
 AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
 print(f'The value of AWS_REGION is {AWS_REGION}')
 
+AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
+if AWS_ACCESS_KEY:
+    print(f'AWS_ACCESS_KEY found in environment variables')
+
+AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
+if AWS_SECRET_KEY:
+    print(f'AWS_SECRET_KEY found in environment variables')
+
 
 
 def get_assumed_role_info():

@@ -36,7 +44,7 @@ if RUN_AWS_FUNCTIONS == "1":
         bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
         session = boto3.Session()
 
-        #print("session:", session)
+        #print("session:", session)
 
     except Exception as e:
         print("Could not start boto3 session:", e)
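The new AWS_ACCESS_KEY / AWS_SECRET_KEY environment variables are only read and logged here; nothing in this commit validates them. A hedged sketch of how they could be sanity-checked at startup with STS, assuming boto3 is available (this helper is hypothetical and not part of the commit):

# Hypothetical startup check for the env-var credentials added in this commit.
import boto3
from botocore.exceptions import ClientError, NoCredentialsError

def check_aws_keys(access_key: str, secret_key: str, region: str = "eu-west-2") -> bool:
    """Return True if the supplied key pair is accepted by AWS STS."""
    if not (access_key and secret_key):
        return False
    try:
        sts = boto3.client("sts",
                           aws_access_key_id=access_key,
                           aws_secret_access_key=secret_key,
                           region_name=region)
        identity = sts.get_caller_identity()
        print("AWS keys belong to account:", identity["Account"])
        return True
    except (ClientError, NoCredentialsError) as e:
        print("AWS key check failed:", e)
        return False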
tools/aws_textract.py CHANGED

@@ -8,6 +8,7 @@ import time
 # Example: converting this single page to an image
 #from pdf2image import convert_from_bytes
 from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
+from tools.aws_functions import AWS_ACCESS_KEY, AWS_SECRET_KEY
 
 def extract_textract_metadata(response):
     """Extracts metadata from an AWS Textract response."""

@@ -30,8 +31,13 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_sig
     Analyse page with AWS Textract
     '''
     if client == "":
-        try:
-            client = boto3.client('textract')
+        try:
+            if AWS_ACCESS_KEY and AWS_SECRET_KEY:
+                client = boto3.client('textract',
+                    aws_access_key_id=AWS_ACCESS_KEY,
+                    aws_secret_access_key=AWS_SECRET_KEY)
+            else:
+                client = boto3.client('textract')
         except:
             print("Cannot connect to AWS Textract")
             return [], "" # Return an empty list and an empty string
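For context, the fallback above produces a standard boto3 Textract client, whether built from the explicit keys or from the default credential chain. Below is a generic illustration of using such a client on a single page's bytes via DetectDocumentText; the function name is made up and this is not claimed to be the exact request analyse_page_with_textract issues:

# Illustrative use of a Textract client created with the fallback above.
import boto3

def detect_lines(page_bytes: bytes, access_key: str = "", secret_key: str = ""):
    """Return the recognised text lines for one page of a document."""
    if access_key and secret_key:
        client = boto3.client("textract",
                              aws_access_key_id=access_key,
                              aws_secret_access_key=secret_key)
    else:
        client = boto3.client("textract")  # default credential chain
    response = client.detect_document_text(Document={"Bytes": page_bytes})
    # LINE blocks carry the recognised text; geometry is also available per block.
    return [block["Text"] for block in response["Blocks"] if block["BlockType"] == "LINE"]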
tools/file_redaction.py CHANGED

@@ -24,7 +24,7 @@ from gradio import Progress
 from collections import defaultdict # For efficient grouping
 
 from presidio_analyzer import RecognizerResult
-from tools.aws_functions import RUN_AWS_FUNCTIONS
+from tools.aws_functions import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY
 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
 from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer

@@ -40,6 +40,7 @@ print(f'The value of page_break_value is {page_break_value}')
 max_time_value = get_or_create_env_var('max_time_value', '999999')
 print(f'The value of max_time_value is {max_time_value}')
 
+
 def bounding_boxes_overlap(box1, box2):
     """Check if two bounding boxes overlap."""
     return (box1[0] < box2[2] and box2[0] < box1[2] and

@@ -96,6 +97,8 @@ def choose_and_run_redactor(file_paths:List[str],
     comprehend_query_number:int=0,
     max_fuzzy_spelling_mistakes_num:int=1,
     match_fuzzy_whole_phrase_bool:bool=True,
+    aws_access_key_textbox:str='',
+    aws_secret_key_textbox:str='',
     output_folder:str=output_folder,
     progress=gr.Progress(track_tqdm=True)):
     '''

@@ -129,8 +132,10 @@ def choose_and_run_redactor(file_paths:List[str],
     - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
     - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
     - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
-    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
-    - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
+    - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
+    - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
     - output_folder (str, optional): Output folder for results.
     - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
 

@@ -242,6 +247,14 @@ def choose_and_run_redactor(file_paths:List[str],
         print("Trying to connect to AWS Comprehend service")
         if RUN_AWS_FUNCTIONS == "1":
             comprehend_client = boto3.client('comprehend')
+        elif aws_access_key_textbox and aws_secret_key_textbox:
+            comprehend_client = boto3.client('comprehend',
+                aws_access_key_id=aws_access_key_textbox,
+                aws_secret_access_key=aws_secret_key_textbox)
+        elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+            comprehend_client = boto3.client('comprehend',
+                aws_access_key_id=AWS_ACCESS_KEY,
+                aws_secret_access_key=AWS_SECRET_KEY)
         else:
             comprehend_client = ""
             out_message = "Cannot connect to AWS Comprehend service. Please choose another PII identification method."

@@ -251,9 +264,17 @@ def choose_and_run_redactor(file_paths:List[str],
         comprehend_client = ""
 
     if in_redact_method == textract_option:
-        print("Trying to connect to AWS Textract service")
+        print("Trying to connect to AWS Textract service")
         if RUN_AWS_FUNCTIONS == "1":
             textract_client = boto3.client('textract')
+        elif aws_access_key_textbox and aws_secret_key_textbox:
+            comprehend_client = boto3.client('textract',
+                aws_access_key_id=aws_access_key_textbox,
+                aws_secret_access_key=aws_secret_key_textbox)
+        elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+            comprehend_client = boto3.client('textract',
+                aws_access_key_id=AWS_ACCESS_KEY,
+                aws_secret_access_key=AWS_SECRET_KEY)
         else:
             textract_client = ""
             out_message = "Cannot connect to AWS Textract. Please choose another text extraction method."
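The same three-way precedence (keys typed into the UI, then keys from the environment, then boto3's default credential chain) is now repeated inline for both the Comprehend and Textract branches of choose_and_run_redactor. A hedged sketch of how that precedence could be factored into a single helper; this helper is hypothetical, the commit keeps the branches inline:

# Hypothetical helper consolidating the credential precedence shown above.
import boto3

def create_aws_client(service_name: str,
                      ui_access_key: str = "",
                      ui_secret_key: str = "",
                      env_access_key: str = "",
                      env_secret_key: str = ""):
    """Create a boto3 client, preferring UI-supplied keys, then env-var keys,
    then boto3's default credential chain (env vars, config files, IAM role)."""
    if ui_access_key and ui_secret_key:
        return boto3.client(service_name,
                            aws_access_key_id=ui_access_key,
                            aws_secret_access_key=ui_secret_key)
    if env_access_key and env_secret_key:
        return boto3.client(service_name,
                            aws_access_key_id=env_access_key,
                            aws_secret_access_key=env_secret_key)
    return boto3.client(service_name)

# Hypothetical call sites:
# comprehend_client = create_aws_client("comprehend", aws_access_key_textbox, aws_secret_key_textbox, AWS_ACCESS_KEY, AWS_SECRET_KEY)
# textract_client = create_aws_client("textract", aws_access_key_textbox, aws_secret_key_textbox, AWS_ACCESS_KEY, AWS_SECRET_KEY)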
tools/find_duplicate_pages.py CHANGED

@@ -65,6 +65,8 @@ def combine_ocr_output_text(input_files):
         if 'page' not in df.columns or 'text' not in df.columns:
             print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
             continue
+
+        df['text'] = df['text'].fillna('').astype(str)
 
         # Group by page and concatenate text
         grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
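The one-line fillna is what makes the duplicate-pages option work for pages with no text: a page that produced no OCR output comes through as NaN in the 'text' column, and ' '.join raises a TypeError on non-string values during the groupby. A small self-contained illustration, using a made-up DataFrame rather than real OCR output:

# Illustration of the failure mode fixed above, with a made-up OCR DataFrame.
import pandas as pd
import numpy as np

df = pd.DataFrame({
    "page": [1, 1, 2],
    "text": ["hello", "world", np.nan],  # page 2 produced no OCR text
})

# Without the fix, the join fails on the NaN:
# df.groupby("page")["text"].apply(" ".join)
# -> TypeError: sequence item 0: expected str instance, float found

# With the fix from this commit, empty pages become empty strings and the join succeeds:
df["text"] = df["text"].fillna("").astype(str)
grouped = df.groupby("page")["text"].apply(" ".join).reset_index()
print(grouped)
# page 1 -> "hello world", page 2 -> ""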