File size: 10,272 Bytes
dce6100 bc22fc4 0f18146 7810536 01c88c0 bc22fc4 7810536 641ff3e bc22fc4 2a4b347 641ff3e bc22fc4 641ff3e bc22fc4 641ff3e 0f18146 2807627 01c88c0 0f18146 bc22fc4 641ff3e 7810536 37d982e 641ff3e 7810536 641ff3e 01c88c0 641ff3e 0f18146 7810536 641ff3e 7810536 01c88c0 7810536 01c88c0 7810536 01c88c0 7810536 01c88c0 7810536 01c88c0 7810536 01c88c0 7810536 641ff3e 7810536 01c88c0 7810536 01c88c0 0f18146 01c88c0 641ff3e 01c88c0 641ff3e 01c88c0 641ff3e 01c88c0 641ff3e 7810536 01c88c0 bc22fc4 7810536 bc22fc4 e08f9b8 bc22fc4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import os
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var
from tools.file_redaction import choose_and_run_redactor
from tools.file_conversion import prepare_image_or_text_pdf
from tools.data_anonymise import anonymise_data_files
from tools.auth import authenticate_user
#from tools.aws_functions import load_data_from_aws
import gradio as gr
# Register the Tesseract and Poppler folders shipped alongside the app
# (presumably so their executables can be found at runtime - see
# tools.helper_functions.add_folder_to_path for the exact behaviour).
for bundled_dir in ("tesseract/", "poppler/poppler-24.02.0/Library/bin/"):
    add_folder_to_path(bundled_dir)

# Make sure the output folder is in place before any handler is wired up.
ensure_output_folder_exists()
# Entities pre-selected in the "Entities to redact" dropdown.
chosen_redact_entities = [
    "TITLES",
    "PERSON",
    "PHONE_NUMBER",
    "EMAIL_ADDRESS",
    "STREETNAME",
    "UKPOSTCODE",
]

# Every entity offered in the dropdown: the defaults first, followed by the
# optional extras. Unpacking builds a separate list, so changing one list
# never affects the other.
full_entity_list = [
    *chosen_redact_entities,
    "CREDIT_CARD",
    "CRYPTO",
    "DATE_TIME",
    "IBAN_CODE",
    "IP_ADDRESS",
    "NRP",
    "LOCATION",
    "MEDICAL_LICENSE",
    "URL",
    "UK_NHS",
]

language = "en"
# Create the gradio interface
#
# NOTE(review): the indentation of this section was lost before review, so the
# layout nesting below (which component sits inside which Tab / Accordion /
# Row) is reconstructed from statement order - confirm against the rendered
# UI. Nesting only affects where components are drawn; the event wiring at
# the bottom refers to components by variable name.
app = gr.Blocks(theme = gr.themes.Base())

with app:

    # Per-session state holders.
    prepared_pdf_state = gr.State([])  # written by prepare_image_or_text_pdf, read by choose_and_run_redactor
    output_image_files_state = gr.State([])  # not referenced by any handler in this section
    output_file_list_state = gr.State([])  # round-trips through choose_and_run_redactor
    text_output_file_list_state = gr.State([])  # round-trips through anonymise_data_files
    session_hash_state = gr.State()  # filled by get_connection_params on page load
    s3_output_folder_state = gr.State()  # filled by get_connection_params on page load

    # The Markdown text is kept flush-left so the string literal is unchanged.
    gr.Markdown(
"""
# Document redaction
Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction.
WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
""")

    with gr.Tab("PDFs/images"):
        with gr.Accordion("Redact document", open = True):
            in_file = gr.File(label="Choose document/image files (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png'])
            redact_btn = gr.Button("Redact document(s)", variant="primary")

        with gr.Row():
            output_summary = gr.Textbox(label="Output summary")
            output_file = gr.File(label="Output files")
            # Doubles as the loop counter: its .change event re-triggers redaction (see below).
            text_documents_done = gr.Number(value=0, label="Number of documents redacted", interactive=False)

        with gr.Row():
            # Hidden and not wired to any handler in this section.
            convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)

    with gr.Tab(label="Open text or Excel/csv files"):
        gr.Markdown(
"""
### Choose open text or a tabular data file (xlsx or csv) to redact.
"""
        )
        with gr.Accordion("Paste open text", open = False):
            in_text = gr.Textbox(label="Enter open text", lines=10)
        with gr.Accordion("Upload xlsx or csv files", open = True):
            in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])

        in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)

        in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")

        tabular_data_redact_btn = gr.Button("Anonymise text", variant="primary")

        with gr.Row():
            text_output_summary = gr.Textbox(label="Output result")
            text_output_file = gr.File(label="Output files")
            # Doubles as the loop counter for tabular files (see below).
            text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False)

    with gr.Tab(label="Redaction settings"):
        gr.Markdown(
"""
Define redaction settings that affect both document and open text redaction.
""")
        with gr.Accordion("Settings for documents", open = True):
            in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
        with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
            anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")

        with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
            in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
            with gr.Row():
                in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
                in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')

        # Invisible text box to hold the session hash/username just for logging purposes
        session_hash_textbox = gr.Textbox(value="", visible=False)

    # AWS options - not yet implemented
    # with gr.Tab(label="Advanced options"):
    #     with gr.Accordion(label = "AWS data access", open = True):
    #         aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
    #         with gr.Row():
    #             in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
    #             load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
    #         aws_log_box = gr.Textbox(label="AWS data load status")
    # ### Loading AWS data ###
    # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])

    callback = gr.CSVLogger()

    # Document redaction
    # The button click and the counter-change handler must run exactly the same
    # two-step pipeline, so the component lists are defined once and shared.
    prepare_inputs = [in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary]
    prepare_outputs = [output_summary, prepared_pdf_state]
    redact_inputs = [in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state]
    redact_outputs = [output_summary, output_file, output_file_list_state, text_documents_done]

    redact_btn.click(
        fn = prepare_image_or_text_pdf, inputs=prepare_inputs, outputs=prepare_outputs, api_name="prepare"
    ).then(
        fn = choose_and_run_redactor, inputs=redact_inputs, outputs=redact_outputs, api_name="redact_doc"
    )

    # If the output file count text box changes, keep going with redacting each document until done
    text_documents_done.change(
        fn = prepare_image_or_text_pdf, inputs=prepare_inputs, outputs=prepare_outputs
    ).then(
        fn = choose_and_run_redactor, inputs=redact_inputs, outputs=redact_outputs
    )

    # Tabular data redaction
    in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])

    # Shared by the button click and the counter-change handler, as above.
    anonymise_inputs = [in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets]
    anonymise_outputs = [text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done]

    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=anonymise_inputs, outputs=anonymise_outputs, api_name="redact_text")

    # If the output file count text box changes, keep going with redacting each data file until done
    text_tabular_files_done.change(fn=anonymise_data_files, inputs=anonymise_inputs, outputs=anonymise_outputs)

    # On page load, fill the session hash / S3 folder state and the hidden textbox.
    app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])

    # This needs to be called at some point prior to the first call to callback.flag()
    callback.setup([session_hash_textbox], "logs")

    #app.load(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
    # Log the session hash whenever the hidden textbox is updated.
    session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
# Launch the Gradio app
# COGNITO_AUTH selects the launch mode: "1" puts the app behind
# authenticate_user, anything else launches without auth and opens a browser.
# The default passed to the helper is '0'.
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')

if __name__ == "__main__":
    # Use the value already fetched above rather than indexing os.environ
    # again: that lookup raises KeyError if the helper did not export the
    # variable into the process environment.
    # NOTE(review): assumes get_or_create_env_var returns the variable's value
    # as a string - confirm in tools.helper_functions.
    if COGNITO_AUTH == "1":
        app.queue().launch(show_error=True, auth=authenticate_user)
    else:
        app.queue().launch(show_error=True, inbrowser=True)