seanpedrickcase commited on
Commit
641ff3e
·
0 Parent(s):

Initial commit

Browse files
.dockerignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.csv
2
+ *.pdf
3
+ *.url
4
+ *.jpg
5
+ *.png
6
+ *.ipynb
7
+ examples/*
8
+ processing/*
9
+ output/*
10
+ tools/__pycache__/*
11
+ old_code/*
.github/workflows/check_file_size.yml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Check file size
2
+ on: # or directly `on: [push]` to run the action on every push on any branch
3
+ pull_request:
4
+ branches: [main]
5
+
6
+ # to run this workflow manually from the Actions tab
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - name: Check large files
14
+ uses: ActionsDesk/lfs-warning@v2.0
15
+ with:
16
+ filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
.github/workflows/sync_to_hf.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+
6
+ # to run this workflow manually from the Actions tab
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v3
14
+ with:
15
+ fetch-depth: 0
16
+ lfs: true
17
+ - name: Push to hub
18
+ env:
19
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
20
+ run: git push https://seanpedrickcase:$HF_TOKEN@huggingface.co/spaces/seanpedrickcase/document_redaction main
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.csv
2
+ *.pdf
3
+ *.url
4
+ *.jpg
5
+ *.png
6
+ *.ipynb
7
+ examples/*
8
+ processing/*
9
+ output/*
10
+ tools/__pycache__/*
11
+ old_code/*
Dockerfile ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
2
+
3
+ # Install system dependencies
4
+ RUN apt-get update \
5
+ && apt-get install -y \
6
+ tesseract-ocr \
7
+ libtesseract-dev \
8
+ poppler-utils \
9
+ && apt-get clean \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ WORKDIR /src
13
+
14
+ COPY requirements.txt .
15
+
16
+ RUN pip install -r requirements.txt
17
+
18
+ # Set up a new user named "user" with user ID 1000
19
+ #RUN useradd -m -u 1000 user
20
+
21
+ # Change ownership of /home/user directory
22
+ #RUN chown -R user:user /home/user
23
+
24
+ # Create the temp files directory and set its permissions
25
+ #RUN mkdir -p /home/user/tmp && chown -R user:user /home/user/tmp
26
+
27
+ # Switch to the "user" user
28
+ #USER user
29
+
30
+ # Set home to the user's home directory
31
+ ENV HOME=/home/user \
32
+ PATH=/home/user/.local/bin:$PATH \
33
+ PYTHONPATH=$HOME/app \
34
+ PYTHONUNBUFFERED=1 \
35
+ GRADIO_ALLOW_FLAGGING=never \
36
+ GRADIO_NUM_PORTS=1 \
37
+ GRADIO_SERVER_NAME=0.0.0.0 \
38
+ GRADIO_SERVER_PORT=7861 \
39
+ GRADIO_THEME=huggingface \
40
+ #GRADIO_TEMP_DIR=$HOME/tmp \
41
+ #GRADIO_ROOT_PATH=/address-match \
42
+ SYSTEM=spaces
43
+
44
+ # Set the working directory to the user's home directory
45
+ WORKDIR $HOME/app
46
+
47
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
48
+ #COPY --chown=user . $HOME/app
49
+ COPY . $HOME/app
50
+
51
+ CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Document redaction
3
+ emoji: 🌍
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 4.27.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # Introduction
14
+ Redact PDF files using image-based OCR or direct text analysis from pdfminer.six. Personal information identification performed using Microsoft Presidio.
15
+
16
+ Take an image-based or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to images and then identify text via OCR methods before redaction. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
17
+
18
+ WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
19
+
app.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tools.file_redaction import redact_text_pdf, redact_image_pdf
2
+ from tools.helper_functions import get_file_path_end
3
+ from tools.file_conversion import process_file, is_pdf
4
+ from tools.aws_functions import load_data_from_aws
5
+
6
+ from typing import List
7
+ import pandas as pd
8
+ import gradio as gr
9
+ import time
10
+
11
# Default example document used for local testing.
file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #

# Entities selected by default in the UI, and the full set offered in the dropdown.
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
# Analysis language (only English is exposed in the UI).
language = 'en'
16
+
17
def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
    '''
    Redact a PDF with the chosen method and return a status message plus output file paths.

    Args:
        file_path: Path to the input PDF.
        language: Language code passed to the analyser (e.g. 'en').
        chosen_redact_entities: Entity types to redact.
        in_redact_method: "Image analysis" (OCR based) or "Text analysis" (selectable text).
        in_allow_list: Rows of terms to ignore, as produced by the Gradio Dataframe;
            may be None when no allow list is supplied.
        progress: Gradio progress tracker.

    Returns:
        Tuple of (status message, list of output file paths).
    '''

    tic = time.perf_counter()

    if not is_pdf(file_path):
        return "Please upload a PDF file.", None

    out_message = ''
    out_file_paths = []

    # The allow list arrives as a list of single-item rows; it can also be None
    # (the parameter default), so guard before flattening — previously this
    # raised TypeError when no allow list was passed.
    in_allow_list_flat = [item for sublist in (in_allow_list or []) for item in sublist]

    if file_path:
        file_path_without_ext = get_file_path_end(file_path)
    else:
        out_message = "No file selected"
        print(out_message)
        return out_message, out_file_paths

    if in_redact_method == "Image analysis":
        # Analyse and redact the rasterised page images via OCR.
        pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
        out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
        pdf_images[0].save(out_image_file_path, "PDF", resolution=100.0, save_all=True, append_images=pdf_images[1:])

        out_file_paths.append(out_image_file_path)
        out_message = "Image-based PDF successfully redacted and saved to file."

    elif in_redact_method == "Text analysis":
        # Analyse the selectable text layer of the PDF.
        pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
        out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
        pdf_text.save(out_text_file_path)

        out_file_paths.append(out_text_file_path)

        # Convert annotated text pdf back to image to give genuine redactions.
        pdf_text_image_paths = process_file(out_text_file_path)
        out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
        pdf_text_image_paths[0].save(out_text_image_file_path, "PDF", resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])

        out_file_paths.append(out_text_image_file_path)

        # Bug fix: this branch handles text-based PDFs — the message previously
        # said "Image-based PDF".
        out_message = "Text-based PDF successfully redacted and saved to text-based annotated file, and image-based file."

    else:
        out_message = "No redaction method selected"
        print(out_message)
        return out_message, out_file_paths

    toc = time.perf_counter()
    out_time = f"Time taken: {toc - tic:0.1f} seconds."
    print(out_time)

    out_message = out_message + "\n\n" + out_time

    return out_message, out_file_paths
74
+
75
+
76
# Create the gradio interface

block = gr.Blocks(theme = gr.themes.Base())

with block:

    # Cross-event state holders. NOTE(review): only the redaction tab is
    # currently wired up; these states look copied from a matching app — confirm
    # whether they are still needed.
    data_state = gr.State(pd.DataFrame())
    ref_data_state = gr.State(pd.DataFrame())
    results_data_state = gr.State(pd.DataFrame())
    ref_results_data_state =gr.State(pd.DataFrame())

    gr.Markdown(
    """
    # Document redaction
    Take an image-based or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to image and the identify text via OCR methods before redaction. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.

    WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
    """)

    with gr.Tab("Redact document"):

        # Inputs: the document, the redaction method, entity selection, and an
        # optional allow list of terms that must never be redacted.
        with gr.Accordion("Input document", open = True):
            in_file = gr.File(label="Choose document file", file_count= "single")
            in_redaction_method = gr.Radio(label="Redaction method", value = "Image analysis", choices=["Image analysis", "Text analysis"])
            in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
            in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language", multiselect=False)
            in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=1, value=[[""]], type="array", column_widths=["50%"])

        redact_btn = gr.Button("Redact document")

        with gr.Row():
            output_summary = gr.Textbox(label="Output summary")
            output_file = gr.File(label="Output file")

    with gr.Tab(label="Advanced options"):
        with gr.Accordion(label = "AWS data access", open = False):
            aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
            with gr.Row():
                in_aws_file = gr.Dropdown(label="Choose keyword file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
                load_aws_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")

            aws_log_box = gr.Textbox(label="AWS data load status")


    ### Loading AWS data ###
    # Downloads example data from S3 into the file input when the password matches.
    load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])


    # Updates to components
    #in_file.change(fn = initial_data_load, inputs=[in_file], outputs=[output_summary, in_redact_entities, in_existing, data_state, results_data_state])
    #in_ref.change(fn = initial_data_load, inputs=[in_ref], outputs=[output_summary, in_refcol, in_joincol, ref_data_state, ref_results_data_state])

    # Main action: run the chosen redactor and surface the summary and files.
    redact_btn.click(fn = choose_and_run_redactor, inputs=[in_file, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
                    outputs=[output_summary, output_file], api_name="redact")

# Simple run for HF spaces or local on your computer
#block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",

# Simple run for AWS server
block.queue().launch(ssl_verify=False) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861

# Download OpenSSL from here:
# Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d
#block.queue().launch(ssl_verify=False, share=False, debug=False, server_name="0.0.0.0",server_port=443,
#                     ssl_certfile="cert.pem", ssl_keyfile="key.pem") # port 443 for https. Certificates currently not valid

# Running on local server without https
#block.queue().launch(server_name="0.0.0.0", server_port=7861, ssl_verify=False)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pdfminer.six==20231228
2
+ pdf2image==1.17.0
3
+ #img2pdf==0.5.1
4
+ presidio_analyzer==2.2.351
5
+ presidio_anonymizer==2.2.351
6
+ presidio-image-redactor==0.0.52
7
+ pikepdf==8.15.1
8
+ pandas==2.2.2
9
+ spacy==3.7.4
10
+ en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
11
+ gradio==4.27.0
tools/__init__.py ADDED
File without changes
tools/aws_functions.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Type
2
+ import pandas as pd
3
+ import boto3
4
+ import tempfile
5
+ import os
6
+
7
+ PandasDataFrame = Type[pd.DataFrame]
8
+
9
+ bucket_name = 'doc-redaction-data'
10
+
11
+ try:
12
+ session = boto3.Session(profile_name="default")
13
+ except Exception as e:
14
+ print(e)
15
+
16
+ # sts = session.client("sts")
17
+ # Create a Session with the IAM role ARN
18
+ # aws_role = os.environ['AWS_ROLE_DATA_TEXT_SEARCH']
19
+ # response = sts.assume_role(
20
+ # RoleArn=aws_role,
21
+ # RoleSessionName="ecs-test-session"
22
+ # )
23
+ # print(response)
24
+
25
+
26
def get_assumed_role_info():
    """Return the ARN and the name of the IAM role/identity currently in use."""
    caller = boto3.client('sts').get_caller_identity()

    # Full ARN of the assumed identity.
    role_arn = caller['Arn']

    # The role/session name is the final '/'-separated segment of the ARN.
    role_name = role_arn.split('/')[-1]

    return role_arn, role_name
37
+
38
+ try:
39
+ assumed_role_arn, assumed_role_name = get_assumed_role_info()
40
+
41
+ print("Assumed Role ARN:", assumed_role_arn)
42
+ print("Assumed Role Name:", assumed_role_name)
43
+ except Exception as e:
44
+ print(e)
45
+
46
+ # Download direct from S3 - requires login credentials
47
def download_file_from_s3(bucket_name, key, local_file_path):
    """Fetch a single S3 object and write it to *local_file_path*."""
    client = boto3.client('s3')
    client.download_file(bucket_name, key, local_file_path)
    print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
52
+
53
+ #download_file_from_s3(bucket_name, object_key, local_file_loc)
54
+
55
def download_folder_from_s3(bucket_name, s3_folder, local_folder):
    """
    Download all files from an S3 folder to a local folder.
    """
    client = boto3.client('s3')

    # Everything under the prefix, mirrored into local_folder.
    listing = client.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)

    for entry in listing.get('Contents', []):
        object_key = entry['Key']
        local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))

        # Make sure the destination directory exists before writing.
        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

        try:
            client.download_file(bucket_name, object_key, local_file_path)
            print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
        except Exception as e:
            print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
79
+
80
+
81
def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
    """
    Download specific files from an S3 folder to a local folder.

    Args:
        bucket_name: Name of the S3 bucket.
        s3_folder: Key prefix ("folder") within the bucket.
        local_folder: Local destination directory.
        filenames: Iterable of file names to fetch, or the string '*' to fetch
            every object under the prefix.
    """
    s3 = boto3.client('s3')

    print("Trying to download file: ", filenames)

    if filenames == '*':
        # List all objects in the S3 folder
        print("Trying to download all files in AWS folder: ", s3_folder)
        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)

        print("Found files in AWS folder: ", response.get('Contents', []))

        filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]

        print("Found filenames in AWS folder: ", filenames)

    for filename in filenames:
        # Bug fix: S3 object keys always use '/' separators. os.path.join would
        # insert '\' on Windows and build an invalid key, so join by hand.
        object_key = f"{s3_folder.rstrip('/')}/{filename}" if s3_folder else filename
        local_file_path = os.path.join(local_folder, filename)

        # Create directories if necessary
        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

        # Download the object
        try:
            s3.download_file(bucket_name, object_key, local_file_path)
            print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
        except Exception as e:
            print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
113
+
114
+
115
+
116
def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):
    """
    Download the example dataset from S3 into a temporary folder, gated by a
    password stored in the LAMBETH_BOROUGH_PLAN_PASSWORD environment variable.

    Args:
        in_aws_keyword_file: Dataset name(s) chosen in the UI dropdown.
        aws_password: Password typed by the user; must match the env variable.
        bucket_name: S3 bucket to read from (defaults to the module-level bucket).

    Returns:
        Tuple of (list of downloaded local file paths, status message).
    """

    temp_dir = tempfile.mkdtemp()
    local_address_stub = temp_dir + '/doc-redaction/'
    files = []

    # Without the expected password in the environment we cannot verify access.
    if not 'LAMBETH_BOROUGH_PLAN_PASSWORD' in os.environ:
        out_message = "Can't verify password for dataset access. Do you have a valid AWS connection? Data not loaded."
        return files, out_message

    if aws_password:
        if "Lambeth borough plan" in in_aws_keyword_file and aws_password == os.environ['LAMBETH_BOROUGH_PLAN_PASSWORD']:

            s3_folder_stub = 'example-data/lambeth-borough-plan/latest/'

            local_folder_path = local_address_stub

            # Check if folder exists
            if not os.path.exists(local_folder_path):
                print(f"Folder {local_folder_path} does not exist! Making folder.")

                os.mkdir(local_folder_path)

            # Check if folder is empty; only download when nothing is cached yet.
            if len(os.listdir(local_folder_path)) == 0:
                print(f"Folder {local_folder_path} is empty")
                # Download data
                download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')

                print("AWS data downloaded")

            else:
                print(f"Folder {local_folder_path} is not empty")

            #files = os.listdir(local_folder_stub)
            #print(files)

            # Return only regular files (skip any sub-directories).
            files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]

            out_message = "Data successfully loaded from AWS"
            print(out_message)

        else:
            out_message = "Data not loaded from AWS"
            print(out_message)
    else:
        out_message = "No password provided. Please ask the data team for access if you need this."
        print(out_message)

    return files, out_message
tools/file_conversion.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pdf2image import convert_from_path
2
+ import os
3
+
4
def is_pdf(filename):
    """
    Check if a file name is a PDF.

    Args:
        filename (str): The name of the file.

    Returns:
        bool: True if the file name ends with ".pdf" (case-insensitive),
        False otherwise.
    """
    # Compare only the final four characters, case-folded.
    suffix = filename[-4:].lower()
    return suffix == ".pdf"
16
+ # %%
17
+ ## Convert pdf to image if necessary
18
+
19
def convert_pdf_to_images(pdf_path):
    """Render every page of the PDF at *pdf_path* and return the list of page images."""

    # pdf2image shells out to poppler to rasterise each page into a PIL image.
    page_images = convert_from_path(pdf_path)

    print("PDF has been converted to images.")

    return page_images
35
+
36
+ # %%
37
def process_file(file_path):
    """
    Dispatch on file extension: image files pass through unchanged, PDFs are
    rasterised to a list of page images, anything else yields [''].
    """
    extension = os.path.splitext(file_path)[1].lower()

    # Image types are returned as-is; no conversion needed.
    if extension in ('.jpg', '.jpeg', '.png', '.gif'):
        print(f"{file_path} is an image file.")
        return [file_path]

    if extension == '.pdf':
        print(f"{file_path} is a PDF file. Converting to image set")
        return convert_pdf_to_images(file_path)

    print(f"{file_path} is not an image or PDF file.")
    return ['']
58
+
tools/file_redaction.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ from typing import List
3
+ import pandas as pd
4
+ from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
5
+ from pdfminer.high_level import extract_pages
6
+ from tools.file_conversion import process_file
7
+ from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
8
+ from pikepdf import Pdf, Dictionary, Name
9
+ from gradio import Progress
10
+
11
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
12
+
13
def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
    '''
    Take a path for a document, convert its pages to images, then run each image
    through the Presidio ImageRedactorEngine to get redacted page images back.

    Args:
        file_path: Path to the input document (PDF or image).
        language: Analysis language code ('en' is mapped to tesseract's 'eng').
        chosen_redact_entities: Entity types to redact.
        allow_list: Terms that should never be redacted.
        progress: Gradio progress tracker.

    Returns:
        List of redacted PIL images, one per page.
    '''

    progress(0, desc="Converting pages to image")

    image_paths = process_file(file_path)

    # Create a new PDF
    #pdf = pikepdf.new()

    images = []
    number_of_pages = len(image_paths)

    progress(0.1, desc="Redacting pages")

    for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):

        # Get the image to redact using PIL lib (pillow)
        image = image_paths[i] #Image.open(image_paths[i])

        # NOTE(review): the analyser and redactor engines are rebuilt on every
        # page; they appear stateless, so hoisting them above the loop should be
        # safe — confirm before changing.
        image_analyser = ImageAnalyzerEngine(nlp_analyser)
        engine = ImageRedactorEngine(image_analyser)

        # Map the app's two-letter language code onto tesseract's code.
        if language == 'en':
            ocr_lang = 'eng'
        else: ocr_lang = language

        # Redact the image with a solid black fill
        redacted_image = engine.redact(image,
                                       fill=(0, 0, 0),
                                       ocr_kwargs={"lang": ocr_lang},
                                       allow_list=allow_list,
                                       ad_hoc_recognizers= None,
                                       **{
                                           "language": language,
                                           "entities": chosen_redact_entities,
                                           "score_threshold": score_threshold
                                       },
                                       )

        images.append(redacted_image)

        # multiple inputs (variant 2)
        # with open("name.pdf","wb") as f:
        #     f.write(img2pdf.convert(["test1.jpg", "test2.png"]))

        # # Create page from image
        # pdf.add_blank_page(page_size=(redacted_image.width, redacted_image.height))
        # page = pdf.pages[-1]
        # page.add_image(redacted_image, 0, 0)

    # Get descriptive output of results for checks - not necessary except for debugging
    # bboxes = image_analyser.analyze(image)

    # # %%
    # check_df = pd.DataFrame(bboxes)[0].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)

    # check_df.columns = ["type", "start", "end", "score", "left", "top", "width", "height"]

    # check_df.to_csv("check_df.csv")

    return images
81
+
82
def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress()):
    '''
    Redact chosen entities from a pdf that is made up of multiple pages that are not images.

    Each page's selectable text is analysed with Presidio; a black highlight
    annotation is then attached over the bounding box of every matching
    character. Also writes a summary CSV of the annotations made to
    "output/annotations_made.csv".

    Args:
        filename: Path to the text-based PDF.
        language: Analysis language code.
        chosen_redact_entities: Entity types to redact.
        allow_list: Terms that should never be redacted.
        progress: Gradio progress tracker.

    Returns:
        The annotated pikepdf Pdf object (the caller saves it to disk).
    '''

    combined_analyzer_results = []
    analyser_explanations = []
    annotations_all_pages = []
    analyzed_bounding_boxes_df = pd.DataFrame()

    pdf = Pdf.open(filename)

    for page_num, page in progress.tqdm(enumerate(pdf.pages), total=len(pdf.pages), unit="pages", desc="Redacting pages"):

        print("Page number is: ", page_num)

        annotations_on_page = []
        analyzed_bounding_boxes = []

        # pdfminer re-parses the file once per page to obtain the text layout.
        for page_layout in extract_pages(filename, page_numbers = [page_num], maxpages=1):
            analyzer_results = []

            for text_container in page_layout:
                if isinstance(text_container, LTTextContainer):
                    text_to_analyze = text_container.get_text()

                    analyzer_results = []
                    characters = []

                    analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
                                                            language=language,
                                                            entities=chosen_redact_entities,
                                                            score_threshold=score_threshold,
                                                            return_decision_process=False,
                                                            allow_list=allow_list)

                    #if analyzer_results:
                    #    pass
                    #explanation = analyzer_results[0].analysis_explanation.to_dict()
                    #analyser_explanations.append(explanation)

                    # Flatten the container into its characters (LTChar and LTAnno),
                    # so analyzer result offsets can be mapped back to glyph boxes.
                    characters = [char # This is what we want to include in the list
                                  for line in text_container # Loop through each line in text_container
                                  if isinstance(line, LTTextLine) # Check if the line is an instance of LTTextLine
                                  for char in line] # Loop through each character in the line
                                  #if isinstance(char, LTChar)] # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or

                    #print(characters)

                    # Collect unique types
                    # unique_types = set()

                    # for line in text_container:
                    #     if isinstance(line, LTTextLine):
                    #         print("Line: ", line)
                    #         for char in line:
                    #             unique_types.add(type(char))
                    #             if isinstance(char, LTAnno):
                    #                 print(char)

                    # # Print the unique types
                    # print("Unique types in text_container:")
                    # for t in unique_types:
                    #     print(t)

                    # If any results found
                    print(analyzer_results)

                    # Map each detected entity span onto the bounding box of every
                    # character it covers (LTAnno entries carry no bbox, so skip them).
                    if len(analyzer_results) > 0 and len(characters) > 0:
                        analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
                        combined_analyzer_results.extend(analyzer_results)

        # NOTE(review): analyzer_results here holds only the LAST text container's
        # results for the page — if that container had no hits the page summary is
        # skipped even when earlier containers matched; confirm this is intended.
        if len(analyzer_results) > 0:
            # Create summary df of annotations to be made
            analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
            analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
            analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
            analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
            analyzed_bounding_boxes_df_new['page'] = page_num + 1
            analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis = 0)

        # Build one opaque black highlight annotation per character bounding box.
        for analyzed_bounding_box in analyzed_bounding_boxes:
            bounding_box = analyzed_bounding_box["boundingBox"]
            annotation = Dictionary(
                Type=Name.Annot,
                Subtype=Name.Highlight,
                QuadPoints=[bounding_box[0], bounding_box[3], bounding_box[2], bounding_box[3], bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[1]],
                Rect=[bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]],
                C=[0, 0, 0],
                CA=1, # Transparency
                T=analyzed_bounding_box["result"].entity_type
            )
            annotations_on_page.append(annotation)

        annotations_all_pages.extend([annotations_on_page])

        print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
        page.Annots = pdf.make_indirect(annotations_on_page)

    # Extracting data from dictionaries
    # extracted_data = []
    # for item in annotations_all_pages:
    #     temp_dict = {}
    #     #print(item)
    #     for key, value in item.items():
    #         if isinstance(value, Decimal):
    #             temp_dict[key] = float(value)
    #         elif isinstance(value, list):
    #             temp_dict[key] = [float(v) if isinstance(v, Decimal) else v for v in value]
    #         else:
    #             temp_dict[key] = value
    #     extracted_data.append(temp_dict)

    # Creating DataFrame
    # annotations_out = pd.DataFrame(extracted_data)
    #print(df)

    #annotations_out.to_csv("examples/annotations.csv")

    # NOTE(review): assumes the "output" directory already exists.
    analyzed_bounding_boxes_df.to_csv("output/annotations_made.csv")

    return pdf
tools/helper_functions.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
def get_file_path_end(file_path):
    """Return the file name from *file_path* without its directory or final extension."""
    # basename strips the directory part; splitext drops the last extension only.
    return os.path.splitext(os.path.basename(file_path))[0]
tools/load_spacy_model_custom_recognisers.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ from typing import List
3
+ from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
4
+ from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
5
+ import spacy
6
+ import re
7
+
8
+ # %%
9
# spaCy pipeline used by the Presidio NLP engine.
model_name = "en_core_web_lg" #"en_core_web_trf"
# Deliberately tiny threshold: keep almost every candidate result and rely on
# entity selection in the UI to filter.
score_threshold = 0.001

# %% [markdown]
# #### Custom recognisers

# %%
# Custom title recogniser
import re
titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
# NOTE(review): each alternative is of the form \bTITLE \b — the title must be
# followed by a space and then a word character; confirm titles at line ends
# are intended to be missed.
titles_regex = '\\b' + ' \\b|\\b'.join(rf"{re.escape(street_type)}" for street_type in titles_list) + ' \\b'
titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern])

# %%
# Custom postcode recogniser

# Define the regex pattern in a Presidio `Pattern` object:
# Matches full UK postcodes (e.g. SE5 9NG, GIR 0AA) plus partial/outward codes.
ukpostcode_pattern = Pattern(name="ukpostcode_pattern",regex="\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b", score = 1)

# Define the recognizer with one or more patterns
ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", patterns = [ukpostcode_pattern])

# %%
# Examples for testing

#text = "I live in 510 Broad st SE5 9NG ."

#numbers_result = ukpostcode_recogniser.analyze(text=text, entities=["UKPOSTCODE"])
#print("Result:")
#print(numbers_result)
+ #print(numbers_result)
40
+
41
+ # %%
42
def extract_street_name(text:str) -> tuple[list[int], list[int]]:
    """
    Extract the character spans of street names, together with the preceding
    word that must contain at least one digit (e.g. a house number), from *text*.

    Args:
        text: Free text to scan.

    Returns:
        Two parallel lists holding the start and end offsets of each match.
        Both lists are empty when nothing matches.
        (Fixed: the annotation previously claimed a str return.)
    """

    street_types = [
        'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
        'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
        'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
        'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
        'Alley', 'Arcade', 'Avenue', 'Ave', 'Bay', 'Bend', 'Brae', 'Byway', 'Close', 'Corner', 'Cove',
        'Crescent', 'Cres', 'Cul-de-sac', 'Dell', 'Drive', 'Dr', 'Esplanade', 'Glen', 'Green', 'Grove', 'Heights', 'Hts',
        'Mews', 'Parade', 'Path', 'Piazza', 'Promenade', 'Quay', 'Ridge', 'Row', 'Terrace', 'Ter', 'Track', 'Trail', 'View', 'Villas',
        'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
    ]

    # Dynamically construct the regex pattern with all possible street types
    street_types_pattern = '|'.join(rf"{re.escape(street_type)}" for street_type in street_types)

    # The preceding word must contain at least one digit (a house number),
    # followed by one word plus a recognised street type.
    pattern = rf'(?P<preceding_word>\w*\d\w*)\s*'
    pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'

    start_positions = []
    end_positions = []

    # Case-insensitive scan; only the spans are needed by the recogniser,
    # so the per-match debug printing has been removed.
    for match in re.finditer(pattern, text, re.IGNORECASE):
        start_positions.append(match.start())
        end_positions.append(match.end())

    return start_positions, end_positions
87
+
88
+
89
+ # %%
90
+ # Some examples for testing
91
+
92
+ #text = "1234 Main Street, 5678 Oak Rd, 9ABC Elm Blvd, 42 Eagle st."
93
+ #text = "Roberto lives in Five 10 Broad st in Oregon"
94
+ #text = "Roberto lives in 55 Oregon Square"
95
+ #text = "There is 51a no way I will do that"
96
+ #text = "I am writing to apply for"
97
+
98
+ #extract_street_name(text)
99
+
100
+ # %%
101
class StreetNameRecognizer(EntityRecognizer):
    """Presidio recognizer that flags street-name spans found by extract_street_name."""

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
        """
        Logic for detecting a specific PII
        """
        start_pos, end_pos = extract_street_name(text)

        # One STREETNAME result per detected span, always at full confidence.
        return [
            RecognizerResult(
                entity_type="STREETNAME",
                start=span_start,
                end=span_end,
                score=1,
            )
            for span_start, span_end in zip(start_pos, end_pos)
        ]
128
+
129
+ street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
130
+
131
+ # %%
132
+ # Create a class inheriting from SpacyNlpEngine
133
+ class LoadedSpacyNlpEngine(SpacyNlpEngine):
134
+ def __init__(self, loaded_spacy_model):
135
+ super().__init__()
136
+ self.nlp = {"en": loaded_spacy_model}
137
+
138
+ # %%
139
# Load a model a-priori
nlp = spacy.load(model_name)

# Pass the loaded model to the new LoadedSpacyNlpEngine
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)



# Shared analyser instance used by both the text and image redaction paths.
nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
                default_score_threshold=score_threshold,
                supported_languages=["en"],
                log_decision_process=True,
                )

# Register the custom recognisers on top of Presidio's defaults.
nlp_analyser.registry.add_recognizer(street_recogniser)
nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
nlp_analyser.registry.add_recognizer(titles_recogniser)
158
+