Commit 01c88c0
Parent(s): e08f9b8

Added logging, anonymising all Excel sheets, simple redaction tags, some Dockerfile optimisation
Files changed:
- .dockerignore             +1 -0
- .gitignore                +1 -0
- Dockerfile                +26 -19
- app.py                    +40 -20
- requirements.txt          +6 -6
- tools/data_anonymise.py   +161 -67
- tools/file_conversion.py  +23 -13
- tools/file_redaction.py   +37 -11
- tools/helper_functions.py +36 -7
.dockerignore CHANGED

@@ -14,4 +14,5 @@ poppler/*
 build/*
 dist/*
 build_deps/*
+logs/*
 doc_redaction_amplify_app/*
.gitignore CHANGED

@@ -14,4 +14,5 @@ poppler/*
 build/*
 dist/*
 build_deps/*
+logs/*
 doc_redaction_amplify_app/*
Dockerfile CHANGED

@@ -1,12 +1,8 @@
-FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
+# Stage 1: Build dependencies and download models
+FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder
 
 # Install system dependencies. Need to specify -y for poppler to get it to install
 RUN apt-get update \
-    && apt-get install -y \
-    tesseract-ocr -y \
-    poppler-utils -y \
-    libgl1-mesa-glx -y \
-    libglib2.0-0 -y \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
@@ -14,19 +10,34 @@ WORKDIR /src
 
 COPY requirements.txt .
 
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir --target=/install -r requirements.txt
+
+RUN rm requirements.txt
 
+# Stage 2: Final runtime image
+FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
+
+# Install system dependencies. Need to specify -y for poppler to get it to install
+RUN apt-get update \
+    && apt-get install -y \
+    tesseract-ocr \
+    poppler-utils \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
 
 # Set up a new user named "user" with user ID 1000
 RUN useradd -m -u 1000 user
 
-# Change ownership of /home/user directory
-#RUN chown -R user:user /home/user
-
 # Make output folder
-RUN mkdir -p /home/user/app/output
+RUN mkdir -p /home/user/app/output \
+    && mkdir -p /home/user/app/tld \
+    && mkdir -p /home/user/app/logs \
+    && chown -R user:user /home/user/app
+
+# Copy installed packages from builder stage
+COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
 
 # Switch to the "user" user
 USER user

@@ -34,18 +45,15 @@ USER user
 # Set environmental variables
 ENV HOME=/home/user \
     PATH=/home/user/.local/bin:$PATH \
-    PYTHONPATH
+    PYTHONPATH=/home/user/app \
     PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
     GRADIO_ALLOW_FLAGGING=never \
     GRADIO_NUM_PORTS=1 \
     GRADIO_SERVER_NAME=0.0.0.0 \
     GRADIO_SERVER_PORT=7860 \
     GRADIO_THEME=huggingface \
     TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
-    #GRADIO_TEMP_DIR=$HOME/tmp \
-    #GRADIO_ROOT_PATH=/address-match \
-    # gunicorn keep alive timeout limit extended for GUI-based work - https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker?tab=readme-ov-file#timeout
-    KEEP_ALIVE=60 \
     SYSTEM=spaces
 
 # Set the working directory to the user's home directory

@@ -53,6 +61,5 @@ WORKDIR $HOME/app
 
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
 COPY --chown=user . $HOME/app
-#COPY . $HOME/app
 
 CMD ["python", "app.py"]
app.py CHANGED

@@ -6,7 +6,7 @@ os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
 from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_text_pdf
-from tools.data_anonymise import do_anonymise
+from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 #from tools.aws_functions import load_data_from_aws
 import gradio as gr

@@ -28,6 +28,7 @@ with app:
     prepared_pdf_state = gr.State([])
     output_image_files_state = gr.State([])
     output_file_list_state = gr.State([])
+    text_output_file_list_state = gr.State([])
 
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()

@@ -51,7 +52,8 @@
 
         with gr.Row():
             output_summary = gr.Textbox(label="Output summary")
-            output_file = gr.File(label="Output
+            output_file = gr.File(label="Output files")
+            text_documents_done = gr.Number(value=0, label="Number of documents redacted", interactive=False)
 
         with gr.Row():
             convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)

@@ -64,16 +66,19 @@
             )
         with gr.Accordion("Paste open text", open = False):
             in_text = gr.Textbox(label="Enter open text", lines=10)
-        with gr.Accordion("Upload xlsx
-
+        with gr.Accordion("Upload xlsx or csv files", open = True):
+            in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
+
+            in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
 
-        in_colnames = gr.Dropdown(choices=["Choose
+            in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
 
-
+        tabular_data_redact_btn = gr.Button("Anonymise text", variant="primary")
 
         with gr.Row():
             text_output_summary = gr.Textbox(label="Output result")
-            text_output_file = gr.File(label="Output
+            text_output_file = gr.File(label="Output files")
+            text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False)
 
     with gr.Tab(label="Redaction settings"):
         gr.Markdown(

@@ -83,13 +88,16 @@
     with gr.Accordion("Settings for documents", open = True):
         in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
     with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
-        anon_strat = gr.Radio(choices=["replace", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace")
+        anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
 
     with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
         in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
        with gr.Row():
             in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
             in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
+
+    # Invisible text box to hold the session hash/username just for logging purposes
+    session_hash_textbox = gr.Textbox(value="", visible=False)
 
     # AWS options - not yet implemented
     # with gr.Tab(label="Advanced options"):

@@ -104,26 +112,38 @@
     # ### Loading AWS data ###
     # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
 
+    callback = gr.CSVLogger()
+
     # Document redaction
-    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list],
+    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary],
                     outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
-    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
-                    outputs=[output_summary, output_file, output_file_list_state], api_name="redact_doc")
-    #then(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
-    #outputs=[output_summary, output_file])
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state],
+                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done], api_name="redact_doc")
 
+    # If the output file count text box changes, keep going with redacting each document until done
+    text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary],
+                    outputs=[output_summary, prepared_pdf_state]).\
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state],
+                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done])
 
-    match_btn.click(fn=do_anonymise, inputs=[in_file_text, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list], outputs=[text_output_summary, text_output_file], api_name="redact_text")
+    # Tabular data redaction
+    in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
+
+    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done], api_name="redact_text")
+
+    # If the output file count text box changes, keep going with redacting each data file until done
+    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done])
 
-    app.load(
+    app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
 
+    # This needs to be called at some point prior to the first call to callback.flag()
+    callback.setup([session_hash_textbox], "logs")
+
+    #app.load(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
+    session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
 
 # Launch the Gradio app
-COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '
+COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
 
 if __name__ == "__main__":
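The logging half of the commit is Gradio's built-in CSVLogger: callback.setup() registers which components get written and the target folder (here "logs", which is why logs/* was added to .dockerignore and .gitignore), and each callback.flag() call appends one CSV row. A minimal sketch of the same wiring outside this app (the component name and the stand-in session value are hypothetical, not the repo's real values):

import gradio as gr

callback = gr.CSVLogger()

with gr.Blocks() as demo:
    # Hidden holder for the session hash, mirroring session_hash_textbox above
    session_box = gr.Textbox(value="", visible=False)

    # Must run before the first callback.flag(); creates logs/log.csv with headers
    callback.setup([session_box], "logs")

    # Populating the box (here on app load) fires .change, which appends a log row
    demo.load(lambda: "session-1234", None, session_box)  # hypothetical session value
    session_box.change(lambda *args: callback.flag(list(args)),
                       [session_box], None, preprocess=False)

demo.launch()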
requirements.txt CHANGED

@@ -6,10 +6,10 @@ presidio_anonymizer==2.2.354
 presidio-image-redactor==0.0.52
 pikepdf==8.15.1
 pandas==2.2.2
-spacy
+spacy==3.7.5
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
-gradio
-boto3==1.34.
-
-openpyxl
-
+gradio>=4.26.0
+boto3==1.34.158
+pyarrow==14.0.2
+openpyxl==3.1.2
+Faker==22.2.0
tools/data_anonymise.py CHANGED

@@ -11,9 +11,9 @@ from typing import List
 
 from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
-from presidio_anonymizer.entities import OperatorConfig
+from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
 
-from tools.helper_functions import output_folder, get_file_path_end, read_file
+from tools.helper_functions import output_folder, get_file_path_end, read_file, detect_file_type
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
 
 # Use custom version of analyze_dict to be able to track progress

@@ -116,17 +116,20 @@ def anon_consistent_names(df):
 
     return scrubbed_df_consistent_names
 
-def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str],
+def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
+
+    key_string = ""
+
     # DataFrame to dict
     df_dict = df.to_dict(orient="list")
 
-    if
+    if in_allow_list:
+        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
     #analyzer = nlp_analyser #AnalyzerEngine()
     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
 
-    anonymizer = AnonymizerEngine()
+    anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
 
     batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)

@@ -134,19 +137,19 @@
     # entities=chosen_redact_entities,
     # score_threshold=score_threshold,
     # return_decision_process=False,
-    #
+    # in_allow_list=in_allow_list_flat)
 
     print("Identifying personal information")
     analyse_tic = time.perf_counter()
 
-    print("Allow list:",
+    print("Allow list:", in_allow_list)
 
     # Use custom analyzer to be able to track progress with Gradio
     analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
                                     entities=chosen_redact_entities,
                                     score_threshold=score_threshold,
                                     return_decision_process=False,
-                                    allow_list=
+                                    allow_list=in_allow_list_flat)
     analyzer_results = list(analyzer_results)
     #analyzer_results

@@ -154,9 +157,7 @@
     analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
     print(analyse_time_out)
 
-    key = secrets.token_bytes(16) # 128 bits = 16 bytes
-    key_string = base64.b64encode(key).decode('utf-8')
 
     # Create faker function (note that it has to receive a value)

@@ -166,6 +167,7 @@
         return fake.first_name()
 
     # Set up the anonymization configuration WITHOUT DATE_TIME
+    simple_replace_config = eval('{"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}')
     replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
     redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
     hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')

@@ -173,12 +175,16 @@
     people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
     fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
 
-    if anon_strat == "replace": chosen_mask_config = replace_config
+    if anon_strat == "replace with <REDACTED>": chosen_mask_config = simple_replace_config
+    if anon_strat == "replace with <ENTITY_NAME>": chosen_mask_config = replace_config
     if anon_strat == "redact": chosen_mask_config = redact_config
     if anon_strat == "hash": chosen_mask_config = hash_config
     if anon_strat == "mask": chosen_mask_config = mask_config
-    if anon_strat == "encrypt":
+    if anon_strat == "encrypt":
+        chosen_mask_config = people_encrypt_config
+        # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
+        key = secrets.token_bytes(16) # 128 bits = 16 bytes
+        key_string = base64.b64encode(key).decode('utf-8')
     elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
 
     # I think in general people will want to keep date / times

@@ -190,17 +196,10 @@
     anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
 
     scrubbed_df = pd.DataFrame(anonymizer_results)
-
-    # Create reporting message
-    out_message = "Successfully anonymised"
-
-    if anon_strat == "encrypt":
-        out_message = out_message + ". Your decryption key is " + key_string + "."
 
-    return scrubbed_df,
+    return scrubbed_df, key_string
 
-def do_anonymise(in_file, in_text:str, anon_strat:str, chosen_cols:List[str], la
-
+def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name):
     def check_lists(list1, list2):
         return any(string in list2 for string in list1)

@@ -221,69 +220,164 @@
             common_strings.append(string)
         return common_strings
 
+    # Check for chosen col, skip file if not found
+    all_cols_original_order = list(anon_df.columns)
+
+    any_cols_found = check_lists(chosen_cols, all_cols_original_order)
+
+    if any_cols_found == False:
+        out_message = "No chosen columns found in dataframe: " + out_file_part
+        print(out_message)
+    else:
+        chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
+
+        # Split dataframe to keep only selected columns
+        print("Remaining columns to redact:", chosen_cols_in_anon_df)
+
+        anon_df_part = anon_df[chosen_cols_in_anon_df]
+        anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
+
+        # Anonymise the selected columns
+        anon_df_part_out, key_string = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list)
+
+        # Rejoin the dataframe together
+        anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
+        anon_df_out = anon_df_out[all_cols_original_order]
+
+        # Export file
+
+        # Rename anonymisation strategy for file path naming
+        if anon_strat == "replace with <REDACTED>": anon_strat_txt = "redact_simple"
+        elif anon_strat == "replace with <ENTITY_NAME>": anon_strat_txt = "redact_entity_type"
+        else: anon_strat_txt = anon_strat
+
+        # If the file is an xlsx, add a new sheet to the existing xlsx. Otherwise, write to csv
+        if file_type == 'xlsx':
+
+            anon_export_file_name = anon_xlsx_export_file_name
+
+            # Create a Pandas Excel writer using XlsxWriter as the engine.
+            with pd.ExcelWriter(anon_xlsx_export_file_name, engine='openpyxl', mode='a') as writer:
+                # Write each DataFrame to a different worksheet.
+                anon_df_out.to_excel(writer, sheet_name=excel_sheet_name, index=None)
+
+        else:
+            anon_export_file_name = output_folder + out_file_part + "_" + excel_sheet_name + "_anon_" + anon_strat_txt + ".csv"
+            anon_df_out.to_csv(anon_export_file_name, index = None)
+
+        out_file_paths.append(anon_export_file_name)
+
+        # As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
+        out_file_paths = list(set(out_file_paths))
+
+        # Print result text to output text box if just anonymising open text
+        if anon_file=='open_text':
+            out_message = [anon_df_out['text'][0]]
+
+    return out_file_paths, out_message, key_string
+
+def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], in_excel_sheets:list=[], progress=Progress(track_tqdm=True)):
+
+    tic = time.perf_counter()
+
     # Load file
+    # If out message or out_file_paths are blank, change to a list so it can be appended to
+    if isinstance(out_message, str):
+        out_message = [out_message]
+
+    if not out_file_paths:
+        out_file_paths = []
+
+    if in_allow_list:
+        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
     anon_df = pd.DataFrame()
-
+    #out_file_paths = []
 
     # Check if files and text exist
-    if not
+    if not file_paths:
         if in_text:
-
+            file_paths=['open_text']
         else:
             out_message = "Please enter text or a file to redact."
-            return out_message,
+            return out_message, out_file_paths, out_file_paths, latest_file_completed
+
+    # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed == len(file_paths):
+        print("Last file reached, returning files:", str(latest_file_completed))
+        final_out_message = '\n'.join(out_message)
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed
 
+    file_path_loop = [file_paths[int(latest_file_completed)]]
+
+    for anon_file in progress.tqdm(file_path_loop, desc="Anonymising files", unit = "file"):
 
-    if
+        if anon_file=='open_text':
             anon_df = pd.DataFrame(data={'text':[in_text]})
             chosen_cols=['text']
-            out_file_part =
+            out_file_part = anon_file
         else:
-
-
+            # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
+            file_type = detect_file_type(anon_file)
+            print("File type is:", file_type)
 
-    all_cols_original_order = list(anon_df.columns)
+            out_file_part = get_file_path_end(anon_file.name)
 
-    else:
-        chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
-
-        # Split dataframe to keep only selected columns
-        print("Remaining columns to redact:", chosen_cols_in_anon_df)
-
-        anon_df_part = anon_df[chosen_cols_in_anon_df]
-        anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
-
-        # Anonymise the selected columns
-        anon_df_part_out, out_message = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, allow_list)
-
-        # Rejoin the dataframe together
-        anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
-        anon_df_out = anon_df_out[all_cols_original_order]
-
-        # Export file
-
-        # out_file_part = re.sub(r'\.csv', '', match_file.name)
-
-    out_message = anon_df_out['text'][0]
-
-    return
+            if file_type == 'xlsx':
+                print("Running through all xlsx sheets")
+                #anon_xlsx = pd.ExcelFile(anon_file)
+                if not in_excel_sheets:
+                    out_message.append("No Excel sheets selected. Please select at least one to anonymise.")
+                    continue
+
+                anon_xlsx = pd.ExcelFile(anon_file)
+
+                # Create xlsx file:
+                anon_xlsx_export_file_name = output_folder + out_file_part + ".xlsx"
+
+                from openpyxl import Workbook
+
+                wb = Workbook()
+                wb.save(anon_xlsx_export_file_name)
+
+                # Iterate through the sheet names
+                for sheet_name in in_excel_sheets:
+                    # Read each sheet into a DataFrame
+                    if sheet_name not in anon_xlsx.sheet_names:
+                        continue
+
+                    anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
+
+                    # Process the DataFrame (e.g., print its contents)
+                    print(f"Sheet Name: {sheet_name}")
+                    print(anon_df.head()) # Print the first few rows
+
+                    out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name)
+
+            else:
+                sheet_name = ""
+                anon_df = read_file(anon_file)
+                out_file_part = get_file_path_end(anon_file.name)
+                out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "")
+
+        # Increase latest file completed count unless we are at the last file
+        if latest_file_completed != len(file_paths):
+            print("Completed file number:", str(latest_file_completed))
+            latest_file_completed += 1
+
+    toc = time.perf_counter()
+    out_time = f"in {toc - tic:0.1f} seconds."
+    print(out_time)
 
+    if anon_strat == "encrypt":
+        out_message.append(". Your decryption key is " + key_string + ".")
 
+    out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
 
+    out_message_out = '\n'.join(out_message)
+    out_message_out = out_message_out + " " + out_time
 
+    return out_message_out, out_file_paths, out_file_paths, latest_file_completed
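The reworked anonymise_script is built around Presidio's batch engines: BatchAnalyzerEngine scans each column of the dict-of-lists, and BatchAnonymizerEngine applies an OperatorConfig per entity type, which is how the new "replace with <REDACTED>" tag is produced. A stripped-down sketch of that pipeline with stock engines (the repo substitutes its own nlp_analyser and a custom analyze_dict for progress tracking; the sample data here is made up, and the allow_list pass-through assumes a recent presidio-analyzer release):

import pandas as pd
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

df = pd.DataFrame({"text": ["My name is Jane Doe", "Jane lives in Lambeth"]})
df_dict = df.to_dict(orient="list")

batch_analyzer = BatchAnalyzerEngine(analyzer_engine=AnalyzerEngine())
batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine=AnonymizerEngine())

# Detect PII in every column; allow-listed terms are left alone
analyzer_results = list(batch_analyzer.analyze_dict(df_dict, language="en",
                                                    allow_list=["Lambeth"]))

# "replace with <REDACTED>": a single replace operator for all detected entities
operators = {"DEFAULT": OperatorConfig("replace", {"new_value": "REDACTED"})}
anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results,
                                                     operators=operators)

scrubbed_df = pd.DataFrame(anonymizer_results)
print(scrubbed_df)  # PERSON mentions become REDACTED; "Lambeth" survives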
tools/file_conversion.py CHANGED

@@ -89,15 +89,31 @@ def process_file(file_path):
 
     return img_object
 
-def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=Progress(track_tqdm=True)):
+def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], progress=Progress(track_tqdm=True)):
 
+    # If out message or out_file_paths are blank, change to a list so it can be appended to
+    #if isinstance(out_message, str):
+    #    out_message = [out_message]
+
+    if not file_paths:
+        file_paths = []
+
+    out_file_paths = file_paths
+
+    latest_file_completed = int(latest_file_completed)
+
+    # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed == len(out_file_paths):
+        print("Last file reached, returning files:", str(latest_file_completed))
+        #final_out_message = '\n'.join(out_message)
+        return out_message, out_file_paths
 
     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
+    file_paths_loop = [out_file_paths[int(latest_file_completed)]]
+
     #for file in progress.tqdm(file_paths, desc="Preparing files"):
-    for file in
+    for file in file_paths_loop:
         file_path = file.name
 
         #if file_path:

@@ -112,7 +128,7 @@
         if is_pdf_or_image(file_path) == False:
             out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
             print(out_message)
-            return out_message,
+            return out_message, out_file_paths
 
         out_file_path = process_file(file_path)
         print("Out file path at image conversion step:", out_file_path)

@@ -121,7 +137,7 @@
         if is_pdf(file_path) == False:
             out_message = "Please upload a PDF file for text analysis."
             print(out_message)
-            return out_message,
+            return out_message, out_file_paths
 
         out_file_path = file_path

@@ -151,10 +167,4 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
 
     print("Out file paths:", out_file_paths)
 
-    return out_message, out_file_paths
-
-
-
-
-
-
+    return out_message, out_file_paths
tools/file_redaction.py CHANGED

@@ -17,20 +17,36 @@ from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_
 import gradio as gr
 
 
-def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
+def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], progress=gr.Progress(track_tqdm=True)):
 
     tic = time.perf_counter()
 
+    # If out message is string or out_file_paths are blank, change to a list so it can be appended to
+    if isinstance(out_message, str):
+        out_message = [out_message]
+
+    if not out_file_paths:
+        out_file_paths = []
+
+    print("Latest file completed is:", str(latest_file_completed))
+
+    latest_file_completed = int(latest_file_completed)
+
+    # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed == len(file_paths):
+        print("Last file reached, returning files:", str(latest_file_completed))
+        final_out_message = '\n'.join(out_message)
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed
+
+    file_paths_loop = [file_paths[int(latest_file_completed)]]
 
     if in_allow_list:
         in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
     print("File paths:", file_paths)
 
-    for file in progress.tqdm(
+    for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
         file_path = file.name
 
         if file_path:

@@ -42,7 +58,7 @@
         else:
             out_message = "No file selected"
             print(out_message)
-            return out_message, out_file_paths
+            return out_message, out_file_paths, out_file_paths, latest_file_completed
 
         if in_redact_method == "Image analysis":
             # Analyse and redact image-based pdf or image

@@ -57,6 +73,11 @@
             out_file_paths.append(out_image_file_path)
             out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file.")
 
+            # Increase latest file completed count unless we are at the last file
+            if latest_file_completed != len(file_paths):
+                print("Completed file number:", str(latest_file_completed))
+                latest_file_completed += 1
+
         elif in_redact_method == "Text analysis":
             if is_pdf(file_path) == False:
                 return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None

@@ -81,21 +102,26 @@
             out_file_paths.extend(img_output_file_path)
 
             # Add confirmation for converting to image if you want
-            # out_message.append(img_output_summary)
+            # out_message.append(img_output_summary)
+
+            if latest_file_completed != len(file_paths):
+                print("Completed file number:", str(latest_file_completed))
+                latest_file_completed += 1
 
         else:
             out_message = "No redaction method selected"
             print(out_message)
-            return out_message, out_file_paths
+            return out_message, out_file_paths, out_file_paths, latest_file_completed
+
 
     toc = time.perf_counter()
-    out_time = f"
+    out_time = f"in {toc - tic:0.1f} seconds."
     print(out_time)
 
     out_message_out = '\n'.join(out_message)
-    out_message_out = out_message_out + "
+    out_message_out = out_message_out + " " + out_time
 
-    return out_message_out, out_file_paths, out_file_paths
+    return out_message_out, out_file_paths, out_file_paths, latest_file_completed
 
 def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
     merged_bboxes = []
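prepare_image_or_text_pdf and choose_and_run_redactor now share one per-call contract: accept the progress counter and accumulated outputs as inputs, do one file's worth of work, and hand the counter back incremented so the text_documents_done.change() wiring in app.py re-invokes the chain until the count equals the file total. The shape of that contract, as a hypothetical stripped-down handler (not the repo's code):

def process_next_file(file_paths, latest_file_completed=0, out_message=None, out_file_paths=None):
    # Normalise accumulated state passed back in from the Gradio components
    out_message = [out_message] if isinstance(out_message, str) else (out_message or [])
    out_file_paths = out_file_paths or []

    # Past the last file: return everything unchanged, which stops the .change() loop
    if int(latest_file_completed) == len(file_paths):
        return "\n".join(out_message), out_file_paths, out_file_paths, latest_file_completed

    # Do exactly one file's worth of work per invocation
    path = file_paths[int(latest_file_completed)]
    out_file_paths.append(path)                    # stand-in for real redaction output
    out_message.append(f"File '{path}' successfully redacted")

    # Incrementing the counter output re-triggers the chain for the next file
    return "\n".join(out_message), out_file_paths, out_file_paths, int(latest_file_completed) + 1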
tools/helper_functions.py CHANGED

@@ -76,17 +76,46 @@ def ensure_output_folder_exists():
 def put_columns_in_df(in_file):
     new_choices = []
     concat_choices = []
+    all_sheet_names = []
+    number_of_excel_files = 0
 
     for file in in_file:
-
-
-
+        file_name = file.name
+        file_type = detect_file_type(file_name)
+        print("File type is:", file_type)
 
+        if file_type == 'xlsx':
+            number_of_excel_files += 1
+            new_choices = []
+            print("Running through all xlsx sheets")
+            anon_xlsx = pd.ExcelFile(file_name)
+            new_sheet_names = anon_xlsx.sheet_names
+            # Iterate through the sheet names
+            for sheet_name in new_sheet_names:
+                # Read each sheet into a DataFrame
+                df = pd.read_excel(file_name, sheet_name=sheet_name)
+
+                # Process the DataFrame (e.g., print its contents)
+                print(f"Sheet Name: {sheet_name}")
+                print(df.head()) # Print the first few rows
+
+                new_choices.extend(list(df.columns))
+
+            all_sheet_names.extend(new_sheet_names)
+
+        else:
+            df = read_file(file_name)
+            new_choices = list(df.columns)
 
+        concat_choices.extend(new_choices)
+
     # Drop duplicate columns
     concat_choices = list(set(concat_choices))
-
-
+
+    if number_of_excel_files > 0:
+        return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names, visible=True)
+    else:
+        return gr.Dropdown(choices=concat_choices, value=concat_choices), gr.Dropdown(visible=False)
 
 # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
 def add_folder_to_path(folder_path: str):

@@ -104,7 +133,7 @@ def add_folder_to_path(folder_path: str):
     if absolute_path not in current_path.split(os.pathsep):
         full_path_extension = absolute_path + os.pathsep + current_path
         os.environ['PATH'] = full_path_extension
-        print(f"Updated PATH with: ", full_path_extension)
+        #print(f"Updated PATH with: ", full_path_extension)
     else:
         print(f"Directory {folder_path} already exists in PATH.")
 else:

@@ -167,7 +196,7 @@ async def get_connection_params(request: gr.Request):
         #if bucket_name:
         #    print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
 
-        return out_session_hash, output_folder
+        return out_session_hash, output_folder, out_session_hash
     else:
         print("No session parameters found.")
         return "",""
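put_columns_in_df now walks every sheet of each uploaded workbook to fill the sheet and column dropdowns, while anon_wrapper_func writes one worksheet per anonymised sheet into a single output file. The round trip in miniature (file names are hypothetical; as in the repo, the output workbook is created empty first so pandas can open it in openpyxl append mode):

import pandas as pd
from openpyxl import Workbook

# Discover sheet names and their columns, as the dropdown helper does
xlsx = pd.ExcelFile("input.xlsx")            # hypothetical uploaded workbook
for sheet_name in xlsx.sheet_names:
    df = pd.read_excel(xlsx, sheet_name=sheet_name)
    print(sheet_name, list(df.columns))

# Create the output workbook up front, then append one worksheet per input sheet
wb = Workbook()
wb.save("output.xlsx")

for sheet_name in xlsx.sheet_names:
    df = pd.read_excel(xlsx, sheet_name=sheet_name)
    # ... anonymise df here ...
    with pd.ExcelWriter("output.xlsx", engine="openpyxl", mode="a") as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=None)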