Commit 01c88c0
Parent(s): e08f9b8

Added logging, anonymising all Excel sheets, simple redaction tags, some Dockerfile optimisation
Files changed:
- .dockerignore             +1 -0
- .gitignore                +1 -0
- Dockerfile                +26 -19
- app.py                    +40 -20
- requirements.txt          +6 -6
- tools/data_anonymise.py   +161 -67
- tools/file_conversion.py  +23 -13
- tools/file_redaction.py   +37 -11
- tools/helper_functions.py +36 -7
.dockerignore CHANGED

@@ -14,4 +14,5 @@ poppler/*
 build/*
 dist/*
 build_deps/*
+logs/*
 doc_redaction_amplify_app/*
.gitignore CHANGED

@@ -14,4 +14,5 @@ poppler/*
 build/*
 dist/*
 build_deps/*
+logs/*
 doc_redaction_amplify_app/*
Dockerfile CHANGED

@@ -1,12 +1,8 @@
-FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
+# Stage 1: Build dependencies and download models
+FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder
 
 # Install system dependencies. Need to specify -y for poppler to get it to install
 RUN apt-get update \
-    && apt-get install -y \
-    tesseract-ocr -y \
-    poppler-utils -y \
-    libgl1-mesa-glx -y \
-    libglib2.0-0 -y \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
@@ -14,19 +10,34 @@ WORKDIR /src
 
 COPY requirements.txt .
 
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir --target=/install -r requirements.txt
+
+RUN rm requirements.txt
 
+# Stage 2: Final runtime image
+FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
+
+# Install system dependencies. Need to specify -y for poppler to get it to install
+RUN apt-get update \
+    && apt-get install -y \
+    tesseract-ocr \
+    poppler-utils \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
 
 # Set up a new user named "user" with user ID 1000
 RUN useradd -m -u 1000 user
 
-# Change ownership of /home/user directory
-#RUN chown -R user:user /home/user
-
 # Make output folder
-RUN mkdir -p /home/user/app/output
+RUN mkdir -p /home/user/app/output \
+    && mkdir -p /home/user/app/tld \
+    && mkdir -p /home/user/app/logs \
+    && chown -R user:user /home/user/app
+
+# Copy installed packages from builder stage
+COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
 
 # Switch to the "user" user
 USER user

@@ -34,18 +45,15 @@ USER user
 # Set environmental variables
 ENV HOME=/home/user \
     PATH=/home/user/.local/bin:$PATH \
-    PYTHONPATH
+    PYTHONPATH=/home/user/app \
     PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
     GRADIO_ALLOW_FLAGGING=never \
     GRADIO_NUM_PORTS=1 \
     GRADIO_SERVER_NAME=0.0.0.0 \
     GRADIO_SERVER_PORT=7860 \
     GRADIO_THEME=huggingface \
     TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
-    #GRADIO_TEMP_DIR=$HOME/tmp \
-    #GRADIO_ROOT_PATH=/address-match \
-    # gunicorn keep alive timeout limit extended for GUI-based work - https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker?tab=readme-ov-file#timeout
-    KEEP_ALIVE=60 \
     SYSTEM=spaces
 
 # Set the working directory to the user's home directory

@@ -53,6 +61,5 @@ WORKDIR $HOME/app
 
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
 COPY --chown=user . $HOME/app
-#COPY . $HOME/app
 
 CMD ["python", "app.py"]
app.py CHANGED

@@ -6,7 +6,7 @@ os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
 from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_text_pdf
-from tools.data_anonymise import do_anonymise
+from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 #from tools.aws_functions import load_data_from_aws
 import gradio as gr

@@ -28,6 +28,7 @@ with app:
     prepared_pdf_state = gr.State([])
     output_image_files_state = gr.State([])
     output_file_list_state = gr.State([])
+    text_output_file_list_state = gr.State([])
 
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()

@@ -51,7 +52,8 @@
 
         with gr.Row():
             output_summary = gr.Textbox(label="Output summary")
-            output_file = gr.File(label="Output
+            output_file = gr.File(label="Output files")
+            text_documents_done = gr.Number(value=0, label="Number of documents redacted", interactive=False)
 
         with gr.Row():
             convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)

@@ -64,16 +66,19 @@
             )
         with gr.Accordion("Paste open text", open = False):
             in_text = gr.Textbox(label="Enter open text", lines=10)
-        with gr.Accordion("Upload xlsx
-
+        with gr.Accordion("Upload xlsx or csv files", open = True):
+            in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+
+            in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
 
-        in_colnames = gr.Dropdown(choices=["Choose
+            in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
 
-
+        tabular_data_redact_btn = gr.Button("Anonymise text", variant="primary")
 
         with gr.Row():
             text_output_summary = gr.Textbox(label="Output result")
-            text_output_file = gr.File(label="Output
+            text_output_file = gr.File(label="Output files")
+            text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False)
 
     with gr.Tab(label="Redaction settings"):
         gr.Markdown(

@@ -83,13 +88,16 @@
     with gr.Accordion("Settings for documents", open = True):
         in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
     with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
-        anon_strat = gr.Radio(choices=["replace", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace")
+        anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
 
     with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
         in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
        with gr.Row():
             in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
             in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
+
+    # Invisible text box to hold the session hash/username just for logging purposes
+    session_hash_textbox = gr.Textbox(value="", visible=False)
 
     # AWS options - not yet implemented
     # with gr.Tab(label="Advanced options"):

@@ -104,26 +112,38 @@
     # ### Loading AWS data ###
     # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
 
+    callback = gr.CSVLogger()
+
     # Document redaction
-    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list],
+    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary],
                     outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
-    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
-                    outputs=[output_summary, output_file, output_file_list_state], api_name="redact_doc")
-    #then(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
-    #outputs=[output_summary, output_file])
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state],
+                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done], api_name="redact_doc")
 
+    # If the output file count text box changes, keep going with redacting each document until done
+    text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary],
+                    outputs=[output_summary, prepared_pdf_state]).\
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state],
+                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done])
 
-    match_btn.click(fn=do_anonymise, inputs=[in_file_text, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list], outputs=[text_output_summary, text_output_file], api_name="redact_text")
+    # Tabular data redaction
+    in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
+
+    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done], api_name="redact_text")
+
+    # If the output file count text box changes, keep going with redacting each data file until done
+    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done])
 
-    app.load(
+    app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
 
+    # This needs to be called at some point prior to the first call to callback.flag()
+    callback.setup([session_hash_textbox], "logs")
+
+    #app.load(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
+    session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
 
 # Launch the Gradio app
-COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '
+COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
 
 if __name__ == "__main__":
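The logging half of the commit is Gradio's built-in CSVLogger: callback.setup() registers which components get written and the target folder (here "logs", which is why logs/* was added to .dockerignore and .gitignore), and each callback.flag() call appends one CSV row. A minimal sketch of the same wiring outside this app (the component name and the stand-in session value are hypothetical, not the repo's real values):

import gradio as gr

callback = gr.CSVLogger()

with gr.Blocks() as demo:
    # Hidden holder for the session hash, mirroring session_hash_textbox above
    session_box = gr.Textbox(value="", visible=False)

    # Must run before the first callback.flag(); creates logs/log.csv with headers
    callback.setup([session_box], "logs")

    # Populating the box (here on app load) fires .change, which appends a log row
    demo.load(lambda: "session-1234", None, session_box)  # hypothetical session value
    session_box.change(lambda *args: callback.flag(list(args)),
                       [session_box], None, preprocess=False)

demo.launch()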
requirements.txt CHANGED

@@ -6,10 +6,10 @@ presidio_anonymizer==2.2.354
 presidio-image-redactor==0.0.52
 pikepdf==8.15.1
 pandas==2.2.2
-spacy
+spacy==3.7.5
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
-gradio
-boto3==1.34.
-
-openpyxl
-
+gradio>=4.26.0
+boto3==1.34.158
+pyarrow==14.0.2
+openpyxl==3.1.2
+Faker==22.2.0
tools/data_anonymise.py CHANGED

@@ -11,9 +11,9 @@ from typing import List
 
 from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
-from presidio_anonymizer.entities import OperatorConfig
+from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
 
-from tools.helper_functions import output_folder, get_file_path_end, read_file
+from tools.helper_functions import output_folder, get_file_path_end, read_file, detect_file_type
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
 
 # Use custom version of analyze_dict to be able to track progress

@@ -116,17 +116,20 @@ def anon_consistent_names(df):
 
     return scrubbed_df_consistent_names
 
-def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str],
+def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
+
+    key_string = ""
+
     # DataFrame to dict
     df_dict = df.to_dict(orient="list")
 
-    if
+    if in_allow_list:
+        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
     #analyzer = nlp_analyser #AnalyzerEngine()
     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
 
-    anonymizer = AnonymizerEngine()
+    anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
 
     batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)

@@ -134,19 +137,19 @@
     # entities=chosen_redact_entities,
     # score_threshold=score_threshold,
     # return_decision_process=False,
-    #
+    # in_allow_list=in_allow_list_flat)
 
     print("Identifying personal information")
     analyse_tic = time.perf_counter()
 
-    print("Allow list:",
+    print("Allow list:", in_allow_list)
 
     # Use custom analyzer to be able to track progress with Gradio
     analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
                                     entities=chosen_redact_entities,
                                     score_threshold=score_threshold,
                                     return_decision_process=False,
-                                    allow_list=
+                                    allow_list=in_allow_list_flat)
     analyzer_results = list(analyzer_results)
     #analyzer_results

@@ -154,9 +157,7 @@
     analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
     print(analyse_time_out)
 
-    key = secrets.token_bytes(16) # 128 bits = 16 bytes
-    key_string = base64.b64encode(key).decode('utf-8')
 
     # Create faker function (note that it has to receive a value)

@@ -166,6 +167,7 @@
         return fake.first_name()
 
     # Set up the anonymization configuration WITHOUT DATE_TIME
+    simple_replace_config = eval('{"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}')
     replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
     redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
     hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')

@@ -173,12 +175,16 @@
     people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
     fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
 
-    if anon_strat == "replace": chosen_mask_config = replace_config
+    if anon_strat == "replace with <REDACTED>": chosen_mask_config = simple_replace_config
+    if anon_strat == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
     if anon_strat == "redact": chosen_mask_config = redact_config
     if anon_strat == "hash": chosen_mask_config = hash_config
     if anon_strat == "mask": chosen_mask_config = mask_config
-    if anon_strat == "encrypt":
+    if anon_strat == "encrypt":
+        chosen_mask_config = people_encrypt_config
+        # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
+        key = secrets.token_bytes(16) # 128 bits = 16 bytes
+        key_string = base64.b64encode(key).decode('utf-8')
     elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
 
     # I think in general people will want to keep date / times

@@ -190,17 +196,10 @@
     anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
 
     scrubbed_df = pd.DataFrame(anonymizer_results)
-
-    # Create reporting message
-    out_message = "Successfully anonymised"
-
-    if anon_strat == "encrypt":
-        out_message = out_message + ". Your decryption key is " + key_string + "."
 
-    return scrubbed_df,
+    return scrubbed_df, key_string
 
-def do_anonymise(in_file, in_text:str, anon_strat:str, chosen_cols:List[str], la
-
+def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name):
     def check_lists(list1, list2):
         return any(string in list2 for string in list1)

@@ -221,69 +220,164 @@
             common_strings.append(string)
         return common_strings
 
+    # Check for chosen col, skip file if not found
+    all_cols_original_order = list(anon_df.columns)
+
+    any_cols_found = check_lists(chosen_cols, all_cols_original_order)
+
+    if any_cols_found == False:
+        out_message = "No chosen columns found in dataframe: " + out_file_part
+        print(out_message)
+    else:
+        chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
+
+        # Split dataframe to keep only selected columns
+        print("Remaining columns to redact:", chosen_cols_in_anon_df)
+
+        anon_df_part = anon_df[chosen_cols_in_anon_df]
+        anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
+
+        # Anonymise the selected columns
+        anon_df_part_out, key_string = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list)
+
+        # Rejoin the dataframe together
+        anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
+        anon_df_out = anon_df_out[all_cols_original_order]
+
+        # Export file
+
+        # Rename anonymisation strategy for file path naming
+        if anon_strat == "replace with <REDACTED>": anon_strat_txt = "redact_simple"
+        elif anon_strat == "replace with <ENTITY_NAME>": anon_strat_txt = "redact_entity_type"
+        else: anon_strat_txt = anon_strat
+
+        # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
+        if file_type == 'xlsx':
+
+            anon_export_file_name = anon_xlsx_export_file_name
+
+            # Create a Pandas Excel writer using XlsxWriter as the engine.
+            with pd.ExcelWriter(anon_xlsx_export_file_name, engine='openpyxl', mode='a') as writer:
+                # Write each DataFrame to a different worksheet.
+                anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
+
+        else:
+            anon_export_file_name = output_folder + out_file_part + "_" + excel_sheet_name + "_anon_" + anon_strat_txt + ".csv"
+            anon_df_out.to_csv(anon_export_file_name, index = None)
+
+        out_file_paths.append(anon_export_file_name)
+
+        # As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
+        out_file_paths = list(set(out_file_paths))
+
+        # Print result text to output text box if just anonymising open text
+        if anon_file=='open_text':
+            out_message = [anon_df_out['text'][0]]
+
+    return out_file_paths, out_message, key_string
+
+def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], in_excel_sheets:list=[], progress=Progress(track_tqdm=True)):
+
+    tic = time.perf_counter()
+
     # Load file
+    # If out message or out_file_paths are blank, change to a list so it can be appended to
+    if isinstance(out_message, str):
+        out_message = [out_message]
+
+    if not out_file_paths:
+        out_file_paths = []
+
+    if in_allow_list:
+        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
     anon_df = pd.DataFrame()
-
+    #out_file_paths = []
 
     # Check if files and text exist
-    if not
+    if not file_paths:
         if in_text:
-
+            file_paths=['open_text']
         else:
             out_message = "Please enter text or a file to redact."
-            return out_message,
+            return out_message, out_file_paths, out_file_paths, latest_file_completed
+
+    # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed == len(file_paths):
+        print("Last file reached, returning files:", str(latest_file_completed))
+        final_out_message = '\n'.join(out_message)
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed
 
+    file_path_loop = [file_paths[int(latest_file_completed)]]
+
+    for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "file"):
 
-    if
+        if anon_file=='open_text':
             anon_df = pd.DataFrame(data={'text':[in_text]})
             chosen_cols=['text']
-            out_file_part =
+            out_file_part = anon_file
         else:
-
-
+            # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
+            file_type = detect_file_type(anon_file)
+            print("File type is:", file_type)
 
-    all_cols_original_order = list(anon_df.columns)
+            out_file_part = get_file_path_end(anon_file.name)
 
-    else:
-        chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
-
-        # Split dataframe to keep only selected columns
-        print("Remaining columns to redact:", chosen_cols_in_anon_df)
-
-        anon_df_part = anon_df[chosen_cols_in_anon_df]
-        anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
-
-        # Anonymise the selected columns
-        anon_df_part_out, out_message = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, allow_list)
-
-        # Rejoin the dataframe together
-        anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
-        anon_df_out = anon_df_out[all_cols_original_order]
-
-        # Export file
-
-        # out_file_part = re.sub(r'\.csv', '', match_file.name)
-
-    out_message = anon_df_out['text'][0]
-
-    return
+            if file_type == 'xlsx':
+                print("Running through all xlsx sheets")
+                #anon_xlsx = pd.ExcelFile(anon_file)
+                if not in_excel_sheets:
+                    out_message.append("No Excel sheets selected. Please select at least one to anonymise.")
+                    continue
+
+                anon_xlsx = pd.ExcelFile(anon_file)
+
+                # Create xlsx file:
+                anon_xlsx_export_file_name = output_folder + out_file_part + ".xlsx"
+
+                from openpyxl import Workbook
+
+                wb = Workbook()
+                wb.save(anon_xlsx_export_file_name)
+
+                # Iterate through the sheet names
+                for sheet_name in in_excel_sheets:
+                    # Read each sheet into a DataFrame
+                    if sheet_name not in anon_xlsx.sheet_names:
+                        continue
+
+                    anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
+
+                    # Process the DataFrame (e.g., print its contents)
+                    print(f"Sheet Name: {sheet_name}")
+                    print(anon_df.head()) # Print the first few rows
+
+                    out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name)
+
+            else:
+                sheet_name = ""
+                anon_df = read_file(anon_file)
+                out_file_part = get_file_path_end(anon_file.name)
+                out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "")
+
+        # Increase latest file completed count unless we are at the last file
+        if latest_file_completed != len(file_paths):
+            print("Completed file number:", str(latest_file_completed))
+            latest_file_completed += 1
+
+    toc = time.perf_counter()
+    out_time = f"in {toc - tic:0.1f} seconds."
+    print(out_time)
 
+    if anon_strat == "encrypt":
+        out_message.append(". Your decryption key is " + key_string + ".")
 
+    out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
 
+    out_message_out = '\n'.join(out_message)
+    out_message_out = out_message_out + " " + out_time
 
+    return out_message_out, out_file_paths, out_file_paths, latest_file_completed
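The reworked anonymise_script is built around Presidio's batch engines: BatchAnalyzerEngine scans each column of the dict-of-lists, and BatchAnonymizerEngine applies an OperatorConfig per entity type, which is how the new "replace with <REDACTED>" tag is produced. A stripped-down sketch of that pipeline with stock engines (the repo substitutes its own nlp_analyser and a custom analyze_dict for progress tracking; the sample data here is made up, and the allow_list pass-through assumes a recent presidio-analyzer release):

import pandas as pd
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

df = pd.DataFrame({"text": ["My name is Jane Doe", "Jane lives in Lambeth"]})
df_dict = df.to_dict(orient="list")

batch_analyzer = BatchAnalyzerEngine(analyzer_engine=AnalyzerEngine())
batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine=AnonymizerEngine())

# Detect PII in every column; allow-listed terms are left alone
analyzer_results = list(batch_analyzer.analyze_dict(df_dict, language="en",
                                                    allow_list=["Lambeth"]))

# "replace with <REDACTED>": a single replace operator for all detected entities
operators = {"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}
anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results,
                                                     operators=operators)

scrubbed_df = pd.DataFrame(anonymizer_results)
print(scrubbed_df)  # PERSON mentions become REDACTED; "Lambeth" survives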
tools/file_conversion.py CHANGED

@@ -89,15 +89,31 @@ def process_file(file_path):
 
     return img_object
 
-def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=Progress(track_tqdm=True)):
+def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], progress=Progress(track_tqdm=True)):
 
+    # If out message or out_file_paths are blank, change to a list so it can be appended to
+    #if isinstance(out_message, str):
+    #    out_message = [out_message]
+
+    if not file_paths:
+        file_paths = []
+
+    out_file_paths = file_paths
+
+    latest_file_completed = int(latest_file_completed)
+
+    # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed == len(out_file_paths):
+        print("Last file reached, returning files:", str(latest_file_completed))
+        #final_out_message = '\n'.join(out_message)
+        return out_message, out_file_paths
 
     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
+    file_paths_loop = [out_file_paths[int(latest_file_completed)]]
+
     #for file in progress.tqdm(file_paths, desc="Preparing files"):
-    for file in
+    for file in file_paths_loop:
         file_path = file.name
 
         #if file_path:

@@ -112,7 +128,7 @@
         if is_pdf_or_image(file_path) == False:
             out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
             print(out_message)
-            return out_message,
+            return out_message, out_file_paths
 
         out_file_path = process_file(file_path)
         print("Out file path at image conversion step:", out_file_path)

@@ -121,7 +137,7 @@
         if is_pdf(file_path) == False:
             out_message = "Please upload a PDF file for text analysis."
             print(out_message)
-            return out_message,
+            return out_message, out_file_paths
 
         out_file_path = file_path

@@ -151,10 +167,4 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
 
     print("Out file paths:", out_file_paths)
 
-    return out_message, out_file_paths
-
-
-
-
-
-
+    return out_message, out_file_paths
tools/file_redaction.py CHANGED

@@ -17,20 +17,36 @@ from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_
 import gradio as gr
 
 
-def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
+def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], progress=gr.Progress(track_tqdm=True)):
 
     tic = time.perf_counter()
 
+    # If out message is string or out_file_paths are blank, change to a list so it can be appended to
+    if isinstance(out_message, str):
+        out_message = [out_message]
+
+    if not out_file_paths:
+        out_file_paths = []
+
+    print("Latest file completed is:", str(latest_file_completed))
+
+    latest_file_completed = int(latest_file_completed)
+
+    # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed == len(file_paths):
+        print("Last file reached, returning files:", str(latest_file_completed))
+        final_out_message = '\n'.join(out_message)
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed
+
+    file_paths_loop = [file_paths[int(latest_file_completed)]]
 
     if in_allow_list:
         in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
     print("File paths:", file_paths)
 
-    for file in progress.tqdm(
+    for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
         file_path = file.name
 
         if file_path:

@@ -42,7 +58,7 @@
         else:
             out_message = "No file selected"
             print(out_message)
-            return out_message, out_file_paths
+            return out_message, out_file_paths, out_file_paths, latest_file_completed
 
         if in_redact_method == "Image analysis":
             # Analyse and redact image-based pdf or image

@@ -57,6 +73,11 @@
             out_file_paths.append(out_image_file_path)
             out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file.")
 
+            # Increase latest file completed count unless we are at the last file
+            if latest_file_completed != len(file_paths):
+                print("Completed file number:", str(latest_file_completed))
+                latest_file_completed += 1
+
         elif in_redact_method == "Text analysis":
             if is_pdf(file_path) == False:
                 return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None

@@ -81,21 +102,26 @@
             out_file_paths.extend(img_output_file_path)
 
             # Add confirmation for converting to image if you want
-            # out_message.append(img_output_summary)
+            # out_message.append(img_output_summary)
+
+            if latest_file_completed != len(file_paths):
+                print("Completed file number:", str(latest_file_completed))
+                latest_file_completed += 1
 
         else:
             out_message = "No redaction method selected"
             print(out_message)
-            return out_message, out_file_paths
+            return out_message, out_file_paths, out_file_paths, latest_file_completed
+
 
     toc = time.perf_counter()
-    out_time = f"
+    out_time = f"in {toc - tic:0.1f} seconds."
     print(out_time)
 
     out_message_out = '\n'.join(out_message)
-    out_message_out = out_message_out + "
+    out_message_out = out_message_out + " " + out_time
 
-    return out_message_out, out_file_paths, out_file_paths
+    return out_message_out, out_file_paths, out_file_paths, latest_file_completed
 
 def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
     merged_bboxes = []
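prepare_image_or_text_pdf and choose_and_run_redactor now share one per-call contract: accept the progress counter and accumulated outputs as inputs, do one file's worth of work, and hand the counter back incremented so the text_documents_done.change() wiring in app.py re-invokes the chain until the count equals the file total. The shape of that contract, as a hypothetical stripped-down handler (not the repo's code):

def process_next_file(file_paths, latest_file_completed=0, out_message=None, out_file_paths=None):
    # Normalise accumulated state passed back in from the Gradio components
    out_message = [out_message] if isinstance(out_message, str) else (out_message or [])
    out_file_paths = out_file_paths or []

    # Past the last file: return everything unchanged, which stops the .change() loop
    if int(latest_file_completed) == len(file_paths):
        return "\n".join(out_message), out_file_paths, out_file_paths, latest_file_completed

    # Do exactly one file's worth of work per invocation
    path = file_paths[int(latest_file_completed)]
    out_file_paths.append(path)                    # stand-in for real redaction output
    out_message.append(f"File '{path}' successfully redacted")

    # Incrementing the counter output re-triggers the chain for the next file
    return "\n".join(out_message), out_file_paths, out_file_paths, int(latest_file_completed) + 1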
tools/helper_functions.py CHANGED

@@ -76,17 +76,46 @@ def ensure_output_folder_exists():
 def put_columns_in_df(in_file):
     new_choices = []
     concat_choices = []
+    all_sheet_names = []
+    number_of_excel_files = 0
 
     for file in in_file:
-
-
-
+        file_name = file.name
+        file_type = detect_file_type(file_name)
+        print("File type is:", file_type)
 
+        if file_type == 'xlsx':
+            number_of_excel_files += 1
+            new_choices = []
+            print("Running through all xlsx sheets")
+            anon_xlsx = pd.ExcelFile(file_name)
+            new_sheet_names = anon_xlsx.sheet_names
+            # Iterate through the sheet names
+            for sheet_name in new_sheet_names:
+                # Read each sheet into a DataFrame
+                df = pd.read_excel(file_name, sheet_name=sheet_name)
+
+                # Process the DataFrame (e.g., print its contents)
+                print(f"Sheet Name: {sheet_name}")
+                print(df.head()) # Print the first few rows
+
+                new_choices.extend(list(df.columns))
+
+            all_sheet_names.extend(new_sheet_names)
+
+        else:
+            df = read_file(file_name)
+            new_choices = list(df.columns)
 
+        concat_choices.extend(new_choices)
+
     # Drop duplicate columns
     concat_choices = list(set(concat_choices))
-
-
+
+    if number_of_excel_files > 0:
+        return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names, visible=True)
+    else:
+        return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(visible=False)
 
 # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
 def add_folder_to_path(folder_path: str):

@@ -104,7 +133,7 @@ def add_folder_to_path(folder_path: str):
     if absolute_path not in current_path.split(os.pathsep):
         full_path_extension = absolute_path + os.pathsep + current_path
         os.environ['PATH'] = full_path_extension
-        print(f"Updated PATH with: ", full_path_extension)
+        #print(f"Updated PATH with: ", full_path_extension)
     else:
         print(f"Directory {folder_path} already exists in PATH.")
 else:

@@ -167,7 +196,7 @@ async def get_connection_params(request: gr.Request):
         #if bucket_name:
         #    print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
 
-        return out_session_hash, output_folder
+        return out_session_hash, output_folder, out_session_hash
     else:
         print("No session parameters found.")
         return "",""
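put_columns_in_df now walks every sheet of each uploaded workbook to fill the sheet and column dropdowns, while anon_wrapper_func writes one worksheet per anonymised sheet into a single output file. The round trip in miniature (file names are hypothetical; as in the repo, the output workbook is created empty first so pandas can open it in openpyxl append mode):

import pandas as pd
from openpyxl import Workbook

# Discover sheet names and their columns, as the dropdown helper does
xlsx = pd.ExcelFile("input.xlsx")            # hypothetical uploaded workbook
for sheet_name in xlsx.sheet_names:
    df = pd.read_excel(xlsx, sheet_name=sheet_name)
    print(sheet_name, list(df.columns))

# Create the output workbook up front, then append one worksheet per input sheet
wb = Workbook()
wb.save("output.xlsx")

for sheet_name in xlsx.sheet_names:
    df = pd.read_excel(xlsx, sheet_name=sheet_name)
    # ... anonymise df here ...
    with pd.ExcelWriter("output.xlsx", engine="openpyxl", mode="a") as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=None)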