seanpedrickcase committed · Commit aa5c211 · 1 Parent(s): bcf1a65

Fixed tabular redaction, added tabular deduplication, and updated CLI call capability for both.

Dockerfile CHANGED
@@ -1,5 +1,5 @@
  # Stage 1: Build dependencies and download models
- FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm AS builder
+ FROM public.ecr.aws/docker/library/python:3.11.13-slim-bookworm AS builder

  # Install system dependencies
  RUN apt-get update \
@@ -24,7 +24,7 @@ COPY lambda_entrypoint.py .
  COPY entrypoint.sh .

  # Stage 2: Final runtime image
- FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
+ FROM public.ecr.aws/docker/library/python:3.11.13-slim-bookworm

  # Set build-time and runtime environment variable
  ARG APP_MODE=gradio
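The only functional change here is the Python base image bump from 3.11.11 to 3.11.13; the two-stage build and the APP_MODE build argument are untouched. A minimal build-and-run sketch (the lambda value for APP_MODE is an assumption — check entrypoint.sh for the modes it actually accepts):

  # Build the default Gradio image (APP_MODE defaults to gradio)
  docker build -t doc-redaction .

  # Assumed: switch the image over to the Lambda entrypoint at build time
  docker build --build-arg APP_MODE=lambda -t doc-redaction-lambda .

  # Run the Gradio app on the default GRADIO_SERVER_PORT
  docker run -p 7860:7860 doc-redaction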
README.md CHANGED
@@ -10,7 +10,7 @@ license: agpl-3.0
  ---
  # Document redaction

- version: 1.0.0
+ version: 1.1.0

  Redact personally identifiable information (PII) from documents (pdf, images), Word files (.docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.

app.py CHANGED
The diff for this file is too large to render. See raw diff
 
cli_redact.py ADDED
@@ -0,0 +1,308 @@
1
+ import argparse
2
+ import os
3
+ import pandas as pd
4
+ from tools.config import get_or_create_env_var, LOCAL_PII_OPTION, AWS_PII_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, INPUT_FOLDER, OUTPUT_FOLDER, DEFAULT_LANGUAGE, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST
5
+ from tools.helper_functions import ensure_output_folder_exists
6
+ from tools.file_conversion import prepare_image_or_pdf
7
+ from tools.file_redaction import choose_and_run_redactor
8
+ from tools.data_anonymise import anonymise_files_with_open_text
9
+ from tools.helper_functions import _get_env_list
10
+ from tools.load_spacy_model_custom_recognisers import custom_entities
11
+ from tools.find_duplicate_pages import run_duplicate_analysis, run_full_search_and_analysis
12
+ from tools.find_duplicate_tabular import run_tabular_duplicate_analysis
13
+
14
+ # --- Constants and Configuration ---
15
+
16
+ if CHOSEN_COMPREHEND_ENTITIES: CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
17
+ if FULL_COMPREHEND_ENTITY_LIST: FULL_COMPREHEND_ENTITY_LIST = _get_env_list(FULL_COMPREHEND_ENTITY_LIST)
18
+ if CHOSEN_REDACT_ENTITIES: CHOSEN_REDACT_ENTITIES = _get_env_list(CHOSEN_REDACT_ENTITIES)
19
+ if FULL_ENTITY_LIST: FULL_ENTITY_LIST = _get_env_list(FULL_ENTITY_LIST)
20
+
21
+ # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
22
+ CHOSEN_COMPREHEND_ENTITIES.extend(custom_entities)
23
+ FULL_COMPREHEND_ENTITY_LIST.extend(custom_entities)
24
+
25
+ chosen_redact_entities = CHOSEN_REDACT_ENTITIES
26
+ full_entity_list = FULL_ENTITY_LIST
27
+ chosen_comprehend_entities = CHOSEN_COMPREHEND_ENTITIES
28
+ full_comprehend_entity_list = FULL_COMPREHEND_ENTITY_LIST
29
+
30
+ # --- Main CLI Function ---
31
+ def main(direct_mode_args=None):
32
+ """
33
+ A unified command-line interface to prepare, redact, and anonymise various document types.
34
+
35
+ Args:
36
+ direct_mode_args (dict, optional): Dictionary of arguments for direct mode execution.
37
+ If provided, uses these instead of parsing command line arguments.
38
+ """
39
+ parser = argparse.ArgumentParser(
40
+ description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
41
+ formatter_class=argparse.RawTextHelpFormatter,
42
+ epilog='''
43
+ Examples:
44
+ # Redact a PDF with default settings:
45
+ python cli_redact.py --input_file document.pdf
46
+
47
+ # Redact specific pages with custom OCR:
48
+ python cli_redact.py --input_file document.pdf --page_min 1 --page_max 10 --ocr_method "AWS Textract service - all PDF types"
49
+
50
+ # Anonymize Excel file with specific columns:
51
+ python cli_redact.py --input_file data.xlsx --columns "Name" "Email" --anon_strat "replace with 'REDACTED'"
52
+
53
+ # Use AWS services with custom settings:
54
+ python cli_redact.py --input_file document.pdf --pii_detector "AWS Comprehend" --aws_access_key YOUR_KEY --aws_secret_key YOUR_SECRET
55
+
56
+ # Advanced redaction with custom word list:
57
+ python cli_redact.py --input_file document.pdf --in_deny_list "CompanyName" "ProjectCode" --deny_list custom_terms.csv
58
+
59
+ # Find duplicate pages in OCR files:
60
+ python cli_redact.py --task deduplicate --input_file ocr_output.csv --duplicate_type pages --similarity_threshold 0.95
61
+
62
+ # Find duplicate content with search query:
63
+ python cli_redact.py --task deduplicate --input_file ocr_output.csv --duplicate_type pages --search_query "confidential information"
64
+
65
+ # Find duplicate rows in tabular data:
66
+ python cli_redact.py --task deduplicate --input_file data.csv --duplicate_type tabular --text_columns "Name" "Description"
67
+ '''
68
+ )
69
+
70
+ # --- Task Selection ---
71
+ task_group = parser.add_argument_group('Task Selection')
72
+ task_group.add_argument('--task',
73
+ choices=['redact', 'deduplicate'],
74
+ default='redact',
75
+ help='Task to perform: redact (PII redaction/anonymization) or deduplicate (find duplicate content).')
76
+
77
+ # --- General Arguments (apply to all file types) ---
78
+ general_group = parser.add_argument_group('General Options')
79
+ general_group.add_argument('--input_file', required=True, help='Path to the input file to process.')
80
+ general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
81
+ general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
82
+ general_group.add_argument('--allow_list', help='Path to a CSV file with words to exclude from redaction.')
83
+ general_group.add_argument('--pii_detector',
84
+ choices=[LOCAL_PII_OPTION, AWS_PII_OPTION],
85
+ default=LOCAL_PII_OPTION,
86
+ help='Core PII detection method (Local or AWS).')
87
+ general_group.add_argument('--aws_access_key', default='', help='Your AWS Access Key ID.')
88
+ general_group.add_argument('--aws_secret_key', default='', help='Your AWS Secret Access Key.')
89
+ general_group.add_argument('--aws_region', default='', help='AWS region for cloud services.')
90
+ general_group.add_argument('--s3_bucket', default='', help='S3 bucket name for cloud operations.')
91
+ general_group.add_argument('--do_initial_clean', action='store_true', help='Perform initial text cleaning for tabular data.')
92
+ general_group.add_argument('--save_logs_to_csv', action='store_true', help='Save processing logs to CSV files.')
93
+ general_group.add_argument('--display_file_names_in_logs', action='store_true', help='Include file names in log outputs.')
94
+
95
+ # --- PDF/Image Redaction Arguments ---
96
+ pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
97
+ pdf_group.add_argument('--ocr_method',
98
+ choices=[SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION],
99
+ default=TESSERACT_TEXT_EXTRACT_OPTION,
100
+ help='OCR method for text extraction from images.')
101
+ pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
102
+ pdf_group.add_argument('--page_max', type=int, default=999, help='Last page to redact.')
103
+ pdf_group.add_argument('--prepare_for_review', action='store_true', help='Prepare files for reviewing redactions.')
104
+ pdf_group.add_argument('--prepare_images', action='store_true', default=True, help='Enable image creation for PDF pages.')
105
+ pdf_group.add_argument('--no_images', action='store_false', dest='prepare_images', help='Disable image creation for PDF pages.')
106
+ pdf_group.add_argument('--images_dpi', type=float, default=300.0, help='DPI for image processing.')
107
+ pdf_group.add_argument('--max_image_pixels', type=int, help='Maximum image pixels for processing.')
108
+ pdf_group.add_argument('--load_truncated_images', action='store_true', help='Load truncated images during processing.')
109
+ pdf_group.add_argument('--chosen_local_ocr_model', choices=['tesseract', 'hybrid', 'paddle'], default='tesseract', help='Local OCR model to use.')
110
+ pdf_group.add_argument('--preprocess_local_ocr_images', action='store_true', help='Preprocess images before OCR.')
111
+ pdf_group.add_argument('--compress_redacted_pdf', action='store_true', help='Compress the final redacted PDF.')
112
+ pdf_group.add_argument('--return_pdf_end_of_redaction', action='store_true', default=True, help='Return PDF at end of redaction process.')
113
+ pdf_group.add_argument('--in_deny_list', nargs='+', default=list(), help='Custom words to recognize for redaction.')
114
+ pdf_group.add_argument('--redact_whole_page_list', nargs='+', default=list(), help='Pages to redact completely.')
115
+ pdf_group.add_argument('--handwrite_signature_checkbox', nargs='+', default=['Extract handwriting', 'Extract signatures'], help='Handwriting and signature extraction options.')
116
+
117
+ # --- Word/Tabular Anonymisation Arguments ---
118
+ tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
119
+ tabular_group.add_argument('--anon_strat', choices=['redact', 'encrypt', 'hash', 'replace with \'REDACTED\'', 'replace with <ENTITY_NAME>', 'redact completely', 'mask', 'fake_first_name'], default='redact', help='The anonymisation strategy to apply.')
120
+ tabular_group.add_argument('--columns', nargs='+', default=list(), help='A list of column names to anonymise in tabular data.')
121
+ tabular_group.add_argument('--excel_sheets', nargs='+', default=list(), help='Specific Excel sheet names to process.')
122
+ tabular_group.add_argument('--deny_list', help='Path to a CSV file with specific terms/phrases to redact.')
123
+ tabular_group.add_argument('--fuzzy_mistakes', type=int, default=1, help='Number of allowed spelling mistakes for fuzzy matching.')
124
+
125
+ # --- Duplicate Detection Arguments ---
126
+ duplicate_group = parser.add_argument_group('Duplicate Detection Options')
127
+ duplicate_group.add_argument('--duplicate_type', choices=['pages', 'tabular'], default='pages', help='Type of duplicate detection: pages (for OCR files) or tabular (for CSV/Excel files).')
128
+ duplicate_group.add_argument('--similarity_threshold', type=float, default=0.95, help='Similarity threshold (0-1) to consider content as duplicates.')
129
+ duplicate_group.add_argument('--min_word_count', type=int, default=3, help='Minimum word count for text to be considered in duplicate analysis.')
130
+ duplicate_group.add_argument('--min_consecutive_pages', type=int, default=1, help='Minimum number of consecutive pages to consider as a match.')
131
+ duplicate_group.add_argument('--greedy_match', action='store_true', default=True, help='Use greedy matching strategy for consecutive pages.')
132
+ duplicate_group.add_argument('--combine_pages', action='store_true', default=True, help='Combine text from the same page number within a file.')
133
+ duplicate_group.add_argument('--search_query', help='Search query text to find specific duplicate content (for page duplicates).')
134
+ duplicate_group.add_argument('--text_columns', nargs='+', default=list(), help='Specific text columns to analyze for duplicates (for tabular data).')
135
+
136
+ # Parse arguments - either from command line or direct mode
137
+ if direct_mode_args:
138
+ # Use direct mode arguments
139
+ args = argparse.Namespace(**direct_mode_args)
140
+ else:
141
+ # Parse command line arguments
142
+ args = parser.parse_args()
143
+
144
+ # --- Initial Setup ---
145
+ ensure_output_folder_exists(args.output_dir)
146
+ _, file_extension = os.path.splitext(args.input_file)
147
+ file_extension = file_extension.lower()
148
+
149
+ # Load allow/deny lists
150
+ allow_list = pd.read_csv(args.allow_list) if args.allow_list else pd.DataFrame()
151
+ deny_list = pd.read_csv(args.deny_list).iloc[:, 0].tolist() if args.deny_list else []
152
+
153
+ # --- Route to the Correct Workflow Based on Task and File Type ---
154
+
155
+ # Task 1: Redaction/Anonymization
156
+ if args.task == 'redact':
157
+ # Workflow 1: PDF/Image Redaction
158
+ if file_extension in ['.pdf', '.png', '.jpg', '.jpeg']:
159
+ print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
160
+ try:
161
+ # Step 1: Prepare the document
162
+ print("\nStep 1: Preparing document...")
163
+ (
164
+ prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
165
+ image_annotations, _, original_cropboxes, page_sizes, textract_output_found, _, _, _, _
166
+ ) = prepare_image_or_pdf(
167
+ file_paths=[args.input_file], text_extract_method=args.ocr_method,
168
+ all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
169
+ first_loop_state=True, prepare_for_review=args.prepare_for_review,
170
+ output_folder=args.output_dir, prepare_images=args.prepare_images
171
+ )
172
+ print(f"Preparation complete. {prep_summary}")
173
+
174
+ # Step 2: Redact the prepared document
175
+ print("\nStep 2: Running redaction...")
176
+ (
177
+ output_summary, output_files, _, _, log_files, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _
178
+ ) = choose_and_run_redactor(
179
+ file_paths=[args.input_file], prepared_pdf_file_paths=prepared_pdf_paths,
180
+ pdf_image_file_paths=image_file_paths, chosen_redact_entities=chosen_redact_entities,
181
+ chosen_redact_comprehend_entities=chosen_comprehend_entities, text_extraction_method=args.ocr_method,
182
+ in_allow_list=allow_list, in_deny_list=args.in_deny_list,
183
+ redact_whole_page_list=args.redact_whole_page_list, first_loop_state=True,
184
+ page_min=args.page_min, page_max=args.page_max, handwrite_signature_checkbox=args.handwrite_signature_checkbox,
185
+ pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
186
+ document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
187
+ aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
188
+ language=args.language, output_folder=args.output_dir
189
+ )
190
+
191
+ print("\n--- Redaction Process Complete ---")
192
+ print(f"Summary: {output_summary}")
193
+ print(f"\nOutput files saved to: {args.output_dir}")
194
+ print("Generated Files:", sorted(output_files))
195
+ if log_files: print("Log Files:", sorted(log_files))
196
+
197
+ except Exception as e:
198
+ print(f"\nAn error occurred during the PDF/Image redaction workflow: {e}")
199
+
200
+ # Workflow 2: Word/Tabular Data Anonymisation
201
+ elif file_extension in ['.docx', '.xlsx', '.xls', '.csv', '.parquet']:
202
+ print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
203
+ try:
204
+ # Run the anonymisation function directly
205
+ output_summary, output_files, _, _, log_files, _, _ = anonymise_files_with_open_text(
206
+ file_paths=[args.input_file],
207
+ in_text="", # Not used for file-based operations
208
+ anon_strat=args.anon_strat,
209
+ chosen_cols=args.columns,
210
+ chosen_redact_entities=chosen_redact_entities,
211
+ in_allow_list=allow_list,
212
+ in_excel_sheets=args.excel_sheets,
213
+ first_loop_state=True,
214
+ output_folder=args.output_dir,
215
+ in_deny_list=deny_list,
216
+ max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
217
+ pii_identification_method=args.pii_detector,
218
+ chosen_redact_comprehend_entities=chosen_comprehend_entities,
219
+ aws_access_key_textbox=args.aws_access_key,
220
+ aws_secret_key_textbox=args.aws_secret_key,
221
+ language=args.language,
222
+ do_initial_clean=args.do_initial_clean
223
+ )
224
+
225
+ print("\n--- Anonymisation Process Complete ---")
226
+ print(f"Summary: {output_summary}")
227
+ print(f"\nOutput files saved to: {args.output_dir}")
228
+ print("Generated Files:", sorted(output_files))
229
+ if log_files: print("Log Files:", sorted(log_files))
230
+
231
+ except Exception as e:
232
+ print(f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}")
233
+
234
+ else:
235
+ print(f"Error: Unsupported file type '{file_extension}' for redaction.")
236
+ print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
237
+ print("Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet")
238
+
239
+ # Task 2: Duplicate Detection
240
+ elif args.task == 'deduplicate':
241
+ print("--- Starting Duplicate Detection Workflow... ---")
242
+ try:
243
+ if args.duplicate_type == 'pages':
244
+ # Page duplicate detection
245
+ if file_extension == '.csv':
246
+ print("--- Detected OCR CSV file. Starting Page Duplicate Detection... ---")
247
+
248
+ if args.search_query:
249
+ # Use search-based duplicate detection
250
+ print(f"Searching for duplicates of: '{args.search_query}'")
251
+ # Note: This would require the OCR data to be loaded first
252
+ # For now, we'll use the general duplicate analysis
253
+ print("Note: Search-based duplicate detection requires OCR data preparation.")
254
+ print("Using general duplicate analysis instead.")
255
+
256
+ # Load the CSV file as a list for the duplicate analysis function
257
+ results_df, output_paths, full_data_by_file = run_duplicate_analysis(
258
+ files=[args.input_file],
259
+ threshold=args.similarity_threshold,
260
+ min_words=args.min_word_count,
261
+ min_consecutive=args.min_consecutive_pages,
262
+ greedy_match=args.greedy_match,
263
+ combine_pages=args.combine_pages
264
+ )
265
+
266
+ print("\n--- Page Duplicate Detection Complete ---")
267
+ print(f"Found {len(results_df)} duplicate matches")
268
+ print(f"\nOutput files saved to: {args.output_dir}")
269
+ if output_paths: print("Generated Files:", sorted(output_paths))
270
+
271
+ else:
272
+ print(f"Error: Page duplicate detection requires CSV files with OCR data.")
273
+ print("Please provide a CSV file containing OCR output data.")
274
+
275
+ elif args.duplicate_type == 'tabular':
276
+ # Tabular duplicate detection
277
+ if file_extension in ['.csv', '.xlsx', '.xls', '.parquet']:
278
+ print("--- Detected tabular file. Starting Tabular Duplicate Detection... ---")
279
+
280
+ results_df, output_paths, full_data_by_file = run_tabular_duplicate_analysis(
281
+ files=[args.input_file],
282
+ threshold=args.similarity_threshold,
283
+ min_words=args.min_word_count,
284
+ text_columns=args.text_columns if args.text_columns else None,
285
+ output_folder=args.output_dir
286
+ )
287
+
288
+ print("\n--- Tabular Duplicate Detection Complete ---")
289
+ print(f"Found {len(results_df)} duplicate matches")
290
+ print(f"\nOutput files saved to: {args.output_dir}")
291
+ if output_paths: print("Generated Files:", sorted(output_paths))
292
+
293
+ else:
294
+ print(f"Error: Tabular duplicate detection requires CSV, Excel, or Parquet files.")
295
+ print("Supported types: .csv, .xlsx, .xls, .parquet")
296
+ else:
297
+ print(f"Error: Invalid duplicate type '{args.duplicate_type}'.")
298
+ print("Valid options: 'pages' or 'tabular'")
299
+
300
+ except Exception as e:
301
+ print(f"\nAn error occurred during the duplicate detection workflow: {e}")
302
+
303
+ else:
304
+ print(f"Error: Invalid task '{args.task}'.")
305
+ print("Valid options: 'redact' or 'deduplicate'")
306
+
307
+ if __name__ == "__main__":
308
+ main()
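cli_redact.py can now also be driven programmatically through the direct_mode_args parameter, which is how the rewritten lambda_entrypoint.py below calls it. One caveat: argparse.Namespace(**direct_mode_args) only carries the keys you supply, so every attribute the chosen workflow reads (for example ocr_method, prepare_images and in_deny_list for PDF redaction) must be present in the dictionary — parser defaults are not applied in direct mode. A minimal sketch for the new tabular deduplication task, with illustrative file names:

  from cli_redact import main as cli_main

  # Only the attributes the 'deduplicate' + 'tabular' branch reads are needed here;
  # allow_list/deny_list are included because they are loaded before task routing.
  direct_args = {
      'task': 'deduplicate',
      'duplicate_type': 'tabular',
      'input_file': 'example_data.csv',   # illustrative path
      'output_dir': 'output/',
      'similarity_threshold': 0.95,
      'min_word_count': 3,
      'text_columns': ['Name', 'Description'],
      'allow_list': None,
      'deny_list': None,
  }

  cli_main(direct_mode_args=direct_args)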
index.md DELETED
@@ -1,8 +0,0 @@
- ---
- layout: default
- title: Home
- redirect_from:
- - "/"
- ---
-
- {% include_relative README.md %}
lambda_entrypoint.py CHANGED
@@ -1,120 +1,149 @@
1
  import boto3
2
  import os
3
- import subprocess
4
 
5
- print("In lambda_entrypoint function")
 
6
 
7
- try:
8
- s3_client = boto3.client("s3", region_name="eu-west-2")
9
- print("s3_client is initialized:", s3_client)
10
- except Exception as e:
11
- print(f"Error initializing s3_client: {e}")
12
- raise e
13
 
14
- TMP_DIR = "/tmp/"
 
 
15
 
16
- run_direct_mode = os.getenv("RUN_DIRECT_MODE", "0")
17
-
18
- if run_direct_mode == "0":
19
- # Gradio App execution
20
- from app import app, max_queue_size, max_file_size # Replace with actual import if needed
21
- from tools.auth import authenticate_user
22
-
23
- if os.getenv("COGNITO_AUTH", "0") == "1":
24
- app.queue(max_size=max_queue_size).launch(show_error=True, auth=authenticate_user, max_file_size=max_file_size)
25
- else:
26
- app.queue(max_size=max_queue_size).launch(show_error=True, inbrowser=True, max_file_size=max_file_size)
27
 
28
  def download_file_from_s3(bucket_name, key, download_path):
29
  """Download a file from S3 to the local filesystem."""
30
- s3_client.download_file(bucket_name, key, download_path)
31
- print(f"Downloaded {key} to {download_path}")
32
-
33
- def upload_file_to_s3(file_path, bucket_name, key):
34
- """Upload a file to S3."""
35
- s3_client.upload_file(file_path, bucket_name, key)
36
- print(f"Uploaded {file_path} to {key}")
37
 
38
  def lambda_handler(event, context):
39
-
40
- print("In lambda_handler function")
41
-
42
- # Create necessary directories
43
- os.makedirs(os.path.join(TMP_DIR, "input"), exist_ok=True)
44
- os.makedirs(os.path.join(TMP_DIR, "output"), exist_ok=True)
45
-
46
- print("Got to record loop")
47
- print("Event records is:", event["Records"])
48
-
49
- # Extract S3 bucket and object key from the Records
50
- for record in event.get("Records", [{}]):
51
- bucket_name = record.get("s3", {}).get("bucket", {}).get("name")
52
- input_key = record.get("s3", {}).get("object", {}).get("key")
53
- print(f"Processing file {input_key} from bucket {bucket_name}")
54
-
55
- # Extract additional arguments
56
- arguments = event.get("arguments", {})
57
-
58
- if not input_key:
59
- input_key = arguments.get("input_file", "")
60
-
61
- ocr_method = arguments.get("ocr_method", "Complex image analysis - docs with handwriting/signatures (AWS Textract)")
62
- pii_detector = arguments.get("pii_detector", "AWS Comprehend")
63
- page_min = str(arguments.get("page_min", 0))
64
- page_max = str(arguments.get("page_max", 0))
65
- allow_list = arguments.get("allow_list", None)
66
- output_dir = arguments.get("output_dir", os.path.join(TMP_DIR, "output"))
 
67
 
68
- print(f"OCR Method: {ocr_method}")
69
- print(f"PII Detector: {pii_detector}")
70
- print(f"Page Range: {page_min} - {page_max}")
71
- print(f"Allow List: {allow_list}")
72
- print(f"Output Directory: {output_dir}")
73
-
74
- # Download input file
75
- input_file_path = os.path.join(TMP_DIR, "input", os.path.basename(input_key))
76
- download_file_from_s3(bucket_name, input_key, input_file_path)
77
-
78
- # Construct command
79
- command = [
80
- "python",
81
- "app.py",
82
- "--input_file", input_file_path,
83
- "--ocr_method", ocr_method,
84
- "--pii_detector", pii_detector,
85
- "--page_min", page_min,
86
- "--page_max", page_max,
87
- "--output_dir", output_dir,
88
- ]
89
-
90
- # Add allow_list only if provided
91
- if allow_list:
92
- allow_list_path = os.path.join(TMP_DIR, "allow_list.csv")
93
- download_file_from_s3(bucket_name, allow_list, allow_list_path)
94
- command.extend(["--allow_list", allow_list_path])
95
-
96
- print(f"Running command: {command}")
97
 
98
- try:
99
- result = subprocess.run(command, capture_output=True, text=True, check=True)
100
- print("Processing succeeded.")
101
- print(result.stdout)
102
- except subprocess.CalledProcessError as e:
103
- print("Error during processing:", e.stderr)
104
- raise e
105
- except Exception as e:
106
- print(f"Unexpected error: {str(e)}")
107
- raise e
108
-
109
- print("Now uploading files from:", output_dir)
110
-
111
- # Upload output files back to S3
112
- for root, _, files in os.walk(output_dir):
113
- for file_name in files:
114
- print("file_name:", file_name)
115
- local_file_path = os.path.join(root, file_name)
116
- output_key = f"output/{file_name}"
117
- print("Output location is:", output_key)
118
- upload_file_to_s3(local_file_path, bucket_name, output_key)
119
-
120
- return {"statusCode": 200, "body": "Processing complete."}
 
 
 
 
1
  import boto3
2
  import os
3
+ import json
4
 
5
+ # Import the main function from your CLI script
6
+ from cli_redact import main as cli_main
7
 
8
+ print("Lambda entrypoint loading...")
 
 
 
 
 
9
 
10
+ # Initialize S3 client outside the handler for connection reuse
11
+ s3_client = boto3.client("s3", region_name=os.getenv("AWS_REGION", "eu-west-2"))
12
+ print("S3 client initialized")
13
 
14
+ # Lambda's only writable directory
15
+ TMP_DIR = "/tmp"
16
+ INPUT_DIR = os.path.join(TMP_DIR, "input")
17
+ OUTPUT_DIR = os.path.join(TMP_DIR, "output")
18
 
19
  def download_file_from_s3(bucket_name, key, download_path):
20
  """Download a file from S3 to the local filesystem."""
21
+ try:
22
+ s3_client.download_file(bucket_name, key, download_path)
23
+ print(f"Successfully downloaded s3://{bucket_name}/{key} to {download_path}")
24
+ except Exception as e:
25
+ print(f"Error downloading from S3: {e}")
26
+ raise
27
+
28
+ def upload_directory_to_s3(local_directory, bucket_name, s3_prefix):
29
+ """Upload all files from a local directory to an S3 prefix."""
30
+ for root, _, files in os.walk(local_directory):
31
+ for file_name in files:
32
+ local_file_path = os.path.join(root, file_name)
33
+ # Create a relative path to maintain directory structure if needed
34
+ relative_path = os.path.relpath(local_file_path, local_directory)
35
+ output_key = os.path.join(s3_prefix, relative_path)
36
+
37
+ try:
38
+ s3_client.upload_file(local_file_path, bucket_name, output_key)
39
+ print(f"Successfully uploaded {local_file_path} to s3://{bucket_name}/{output_key}")
40
+ except Exception as e:
41
+ print(f"Error uploading to S3: {e}")
42
+ raise
43
 
44
  def lambda_handler(event, context):
45
+ print(f"Received event: {json.dumps(event)}")
46
+
47
+ # 1. Setup temporary directories
48
+ os.makedirs(INPUT_DIR, exist_ok=True)
49
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
50
+
51
+ # 2. Extract information from the event
52
+ # Assumes the event is triggered by S3 and may contain an 'arguments' payload
53
+ try:
54
+ record = event['Records'][0]
55
+ bucket_name = record['s3']['bucket']['name']
56
+ input_key = record['s3']['object']['key']
57
+
58
+ # The user metadata can be used to pass arguments
59
+ # This is more robust than embedding them in the main event body
60
+ response = s3_client.head_object(Bucket=bucket_name, Key=input_key)
61
+ metadata = response.get('Metadata', {})
62
+ # Arguments can be passed as a JSON string in metadata
63
+ arguments = json.loads(metadata.get('arguments', '{}'))
64
+
65
+ except (KeyError, IndexError) as e:
66
+ print(f"Could not parse S3 event record: {e}. Checking for direct invocation payload.")
67
+ # Fallback for direct invocation (e.g., from Step Functions or manual test)
68
+ bucket_name = event.get('bucket_name')
69
+ input_key = event.get('input_key')
70
+ arguments = event.get('arguments', {})
71
+ if not all([bucket_name, input_key]):
72
+ raise ValueError("Missing 'bucket_name' or 'input_key' in direct invocation event.")
73
+
74
+ print(f"Processing s3://{bucket_name}/{input_key}")
75
+ print(f"With arguments: {arguments}")
76
+
77
+ # 3. Download the main input file
78
+ input_file_path = os.path.join(INPUT_DIR, os.path.basename(input_key))
79
+ download_file_from_s3(bucket_name, input_key, input_file_path)
80
+
81
+ # 4. Prepare arguments for the CLI function
82
+ # This dictionary should mirror the one in your app.py's "direct mode"
83
+ cli_args = {
84
+ 'task': arguments.get('task', 'redact'),
85
+ 'input_file': input_file_path,
86
+ 'output_dir': OUTPUT_DIR,
87
+ 'language': arguments.get('language', 'en_core_web_sm'),
88
+ 'pii_detector': arguments.get('pii_detector', 'Local Spacy model'), # Default to local
89
+ 'ocr_method': arguments.get('ocr_method', 'Tesseract OCR - all PDF types'),
90
+ 'page_min': int(arguments.get('page_min', 0)),
91
+ 'page_max': int(arguments.get('page_max', 999)),
92
+
93
+ # Handle optional files like allow/deny lists
94
+ 'allow_list': None,
95
+ 'deny_list': None,
96
+
97
+ # Deduplication specific arguments
98
+ 'duplicate_type': arguments.get('duplicate_type', 'pages'),
99
+ 'similarity_threshold': float(arguments.get('similarity_threshold', 0.95)),
100
+ 'min_word_count': int(arguments.get('min_word_count', 3)),
101
+ 'search_query': arguments.get('search_query'),
102
+ 'text_columns': arguments.get('text_columns', []),
103
 
104
+ # Add other arguments from your app.py as needed, using .get() for safety
105
+ 'anon_strat': arguments.get('anon_strat', 'redact'),
106
+ 'columns': arguments.get('columns', []),
107
+ 'aws_access_key': None, # Best practice: use IAM Role instead of keys
108
+ 'aws_secret_key': None,
109
+ 'aws_region': os.getenv("AWS_REGION", "eu-west-2"),
110
+ 's3_bucket': bucket_name,
111
+ # Set defaults for boolean flags
112
+ 'prepare_images': True,
113
+ 'compress_redacted_pdf': False,
114
+ 'return_pdf_end_of_redaction': True
115
+ }
116
+
117
+ # Download optional files if they are specified
118
+ allow_list_key = arguments.get('allow_list')
119
+ if allow_list_key:
120
+ allow_list_path = os.path.join(INPUT_DIR, 'allow_list.csv')
121
+ download_file_from_s3(bucket_name, allow_list_key, allow_list_path)
122
+ cli_args['allow_list'] = allow_list_path
123
 
124
+ deny_list_key = arguments.get('deny_list')
125
+ if deny_list_key:
126
+ deny_list_path = os.path.join(INPUT_DIR, 'deny_list.csv')
127
+ download_file_from_s3(bucket_name, deny_list_key, deny_list_path)
128
+ cli_args['deny_list'] = deny_list_path
129
+
130
+ # 5. Execute the main application logic
131
+ try:
132
+ print("--- Starting CLI Redact Main Function ---")
133
+ print(f"Arguments passed to cli_main: {cli_args}")
134
+ cli_main(direct_mode_args=cli_args)
135
+ print("--- CLI Redact Main Function Finished ---")
136
+ except Exception as e:
137
+ print(f"An error occurred during CLI execution: {e}")
138
+ # Optionally, re-raise the exception to make the Lambda fail
139
+ raise
140
+
141
+ # 6. Upload results back to S3
142
+ output_s3_prefix = f"output/{os.path.splitext(os.path.basename(input_key))[0]}"
143
+ print(f"Uploading contents of {OUTPUT_DIR} to s3://{bucket_name}/{output_s3_prefix}/")
144
+ upload_directory_to_s3(OUTPUT_DIR, bucket_name, output_s3_prefix)
145
+
146
+ return {
147
+ "statusCode": 200,
148
+ "body": json.dumps(f"Processing complete for {input_key}. Output saved to s3://{bucket_name}/{output_s3_prefix}/")
149
+ }
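The handler now calls cli_main in-process instead of shelling out to app.py, and accepts either an S3 event record (with optional arguments passed as a JSON string in the object's user metadata) or a direct-invocation payload. A sketch of the direct-invocation shape with placeholder bucket/key names — exercising it for real needs AWS credentials and an object at that key:

  # Placeholder names; the fallback branch above reads bucket_name, input_key and arguments.
  event = {
      "bucket_name": "my-redaction-bucket",
      "input_key": "input/document.pdf",
      "arguments": {
          "task": "redact",
          "ocr_method": "Tesseract OCR - all PDF types",
          "pii_detector": "Local Spacy model",
          "page_min": 0,
          "page_max": 10
      }
  }
  # lambda_handler(event, None) downloads the object, runs the CLI, and uploads the
  # results under an output/<input file name> prefix in the same bucket.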
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "doc_redaction"
- version = "1.0.0"
+ version = "1.1.0"
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
  readme = "README.md"
  requires-python = ">=3.10"
@@ -23,7 +23,7 @@ dependencies = [
  "spacy==3.8.7",
  # Direct URL dependency for spacy model
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
- "gradio==5.44.0",
+ "gradio==5.45.0",
  "boto3==1.40.10",
  "pyarrow==21.0.0",
  "openpyxl==3.1.5",
@@ -37,7 +37,8 @@ dependencies = [
  "awslambdaric==3.1.1",
  "python-docx==1.2.0",
  "paddlepaddle==3.1.0",
- "paddleocr==3.1.1"
+ "paddleocr==3.1.1",
+ "polars==1.33.1"
  ]

  [project.urls]
requirements.txt CHANGED
@@ -6,11 +6,12 @@ presidio_analyzer==2.2.359
  presidio_anonymizer==2.2.359
  presidio-image-redactor==0.0.57
  pikepdf==9.10.2
- pandas==2.3.1
+ pandas==2.3.2
  scikit-learn==1.7.1
  spacy==3.8.7
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
- gradio==5.44.0
+ gradio==5.45.0
+ polars==1.33.1
  boto3==1.40.10
  pyarrow==21.0.0
  openpyxl==3.1.5
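The pins move pandas to 2.3.2 and gradio to 5.45.0 and add polars 1.33.1; refreshing an existing environment is just a reinstall from the updated file:

  python -m pip install -r requirements.txt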
tools/cli_redact.py DELETED
@@ -1,164 +0,0 @@
1
- import argparse
2
- import os
3
- import pandas as pd
4
- from tools.config import get_or_create_env_var, LOCAL_PII_OPTION, AWS_PII_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
5
- from tools.helper_functions import ensure_output_folder_exists
6
- from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
7
- from tools.file_redaction import choose_and_run_redactor
8
- from tools.anonymisation import anonymise_files_with_open_text
9
-
10
- # --- Constants and Configuration ---
11
- INPUT_FOLDER = 'input/'
12
- OUTPUT_FOLDER = 'output/'
13
- DEFAULT_LANGUAGE = 'en'
14
-
15
- # Define entities for redaction
16
- chosen_comprehend_entities = [
17
- 'BANK_ACCOUNT_NUMBER', 'BANK_ROUTING', 'CREDIT_DEBIT_NUMBER',
18
- 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY', 'PIN', 'EMAIL', 'ADDRESS',
19
- 'NAME', 'PHONE', 'PASSPORT_NUMBER', 'DRIVER_ID', 'USERNAME', 'PASSWORD',
20
- 'IP_ADDRESS', 'MAC_ADDRESS', 'LICENSE_PLATE', 'VEHICLE_IDENTIFICATION_NUMBER',
21
- 'UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER',
22
- 'SWIFT_CODE', 'UK_NATIONAL_HEALTH_SERVICE_NUMBER'
23
- ]
24
- chosen_redact_entities = [
25
- "TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"
26
- ]
27
-
28
- # --- Main CLI Function ---
29
- def main():
30
- """
31
- A unified command-line interface to prepare, redact, and anonymise various document types.
32
- """
33
- parser = argparse.ArgumentParser(
34
- description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
35
- formatter_class=argparse.RawTextHelpFormatter
36
- )
37
-
38
- # --- General Arguments (apply to all file types) ---
39
- general_group = parser.add_argument_group('General Options')
40
- general_group.add_argument('--input_file', required=True, help='Path to the input file to process.')
41
- general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
42
- general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
43
- general_group.add_argument('--allow_list', help='Path to a CSV file with words to exclude from redaction.')
44
- general_group.add_argument('--pii_detector',
45
- choices=[LOCAL_PII_OPTION, AWS_PII_OPTION],
46
- default=LOCAL_PII_OPTION,
47
- help='Core PII detection method (Local or AWS).')
48
- general_group.add_argument('--aws_access_key', default='', help='Your AWS Access Key ID.')
49
- general_group.add_argument('--aws_secret_key', default='', help='Your AWS Secret Access Key.')
50
-
51
- # --- PDF/Image Redaction Arguments ---
52
- pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
53
- pdf_group.add_argument('--ocr_method',
54
- choices=[SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION],
55
- default=TESSERACT_TEXT_EXTRACT_OPTION,
56
- help='OCR method for text extraction from images.')
57
- pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
58
- pdf_group.add_argument('--page_max', type=int, default=999, help='Last page to redact.')
59
- pdf_group.add_argument('--prepare_for_review', action='store_true', help='Prepare files for reviewing redactions.')
60
- pdf_group.add_argument('--no_images', action='store_false', dest='prepare_images', help='Disable image creation for PDF pages.')
61
-
62
- # --- Word/Tabular Anonymisation Arguments ---
63
- tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
64
- tabular_group.add_argument('--anon_strat', choices=['redact', 'encrypt', 'hash'], default='redact', help='The anonymisation strategy to apply.')
65
- tabular_group.add_argument('--columns', nargs='+', default=[], help='A list of column names to anonymise in tabular data.')
66
- tabular_group.add_argument('--excel_sheets', nargs='+', default=[], help='Specific Excel sheet names to process.')
67
- tabular_group.add_argument('--deny_list', help='Path to a CSV file with specific terms/phrases to redact.')
68
- tabular_group.add_argument('--fuzzy_mistakes', type=int, default=1, help='Number of allowed spelling mistakes for fuzzy matching.')
69
-
70
- args = parser.parse_args()
71
-
72
- # --- Initial Setup ---
73
- ensure_output_folder_exists(args.output_dir)
74
- _, file_extension = os.path.splitext(args.input_file)
75
- file_extension = file_extension.lower()
76
-
77
- # Load allow/deny lists
78
- allow_list = pd.read_csv(args.allow_list) if args.allow_list else pd.DataFrame()
79
- deny_list = pd.read_csv(args.deny_list).iloc[:, 0].tolist() if args.deny_list else []
80
-
81
-
82
- # --- Route to the Correct Workflow Based on File Type ---
83
-
84
- # Workflow 1: PDF/Image Redaction
85
- if file_extension in ['.pdf', '.png', '.jpg', '.jpeg']:
86
- print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
87
- try:
88
- # Step 1: Prepare the document
89
- print("\nStep 1: Preparing document...")
90
- (
91
- prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
92
- image_annotations, _, original_cropboxes, page_sizes, textract_output_found, _, _, _, _
93
- ) = prepare_image_or_pdf(
94
- file_paths=[args.input_file], text_extract_method=args.ocr_method,
95
- all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
96
- first_loop_state=True, prepare_for_review=args.prepare_for_review,
97
- output_folder=args.output_dir, prepare_images=args.prepare_images
98
- )
99
- print(f"Preparation complete. {prep_summary}")
100
-
101
- # Step 2: Redact the prepared document
102
- print("\nStep 2: Running redaction...")
103
- (
104
- output_summary, output_files, _, _, log_files, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _
105
- ) = choose_and_run_redactor(
106
- file_paths=[args.input_file], prepared_pdf_file_paths=prepared_pdf_paths,
107
- pdf_image_file_paths=image_file_paths, chosen_redact_entities=chosen_redact_entities,
108
- chosen_redact_comprehend_entities=chosen_comprehend_entities, text_extraction_method=args.ocr_method,
109
- in_allow_list=allow_list, first_loop_state=True, page_min=args.page_min, page_max=args.page_max,
110
- pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
111
- document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
112
- aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
113
- language=args.language, output_folder=args.output_dir
114
- )
115
-
116
- print("\n--- Redaction Process Complete ---")
117
- print(f"Summary: {output_summary}")
118
- print(f"\nOutput files saved to: {args.output_dir}")
119
- print("Generated Files:", sorted(output_files))
120
- if log_files: print("Log Files:", sorted(log_files))
121
-
122
- except Exception as e:
123
- print(f"\nAn error occurred during the PDF/Image redaction workflow: {e}")
124
-
125
- # Workflow 2: Word/Tabular Data Anonymisation
126
- elif file_extension in ['.docx', '.xlsx', '.xls', '.csv', '.parquet']:
127
- print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
128
- try:
129
- # Run the anonymisation function directly
130
- output_summary, output_files, _, _, log_files, _, _ = anonymise_files_with_open_text(
131
- file_paths=[args.input_file],
132
- in_text="", # Not used for file-based operations
133
- anon_strat=args.anon_strat,
134
- chosen_cols=args.columns,
135
- chosen_redact_entities=chosen_redact_entities,
136
- in_allow_list=allow_list,
137
- in_excel_sheets=args.excel_sheets,
138
- first_loop_state=True,
139
- output_folder=args.output_dir,
140
- in_deny_list=deny_list,
141
- max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
142
- pii_identification_method=args.pii_detector,
143
- chosen_redact_comprehend_entities=chosen_comprehend_entities,
144
- aws_access_key_textbox=args.aws_access_key,
145
- aws_secret_key_textbox=args.aws_secret_key,
146
- language=args.language
147
- )
148
-
149
- print("\n--- Anonymisation Process Complete ---")
150
- print(f"Summary: {output_summary}")
151
- print(f"\nOutput files saved to: {args.output_dir}")
152
- print("Generated Files:", sorted(output_files))
153
- if log_files: print("Log Files:", sorted(log_files))
154
-
155
- except Exception as e:
156
- print(f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}")
157
-
158
- else:
159
- print(f"Error: Unsupported file type '{file_extension}'.")
160
- print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
161
- print("Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet")
162
-
163
- if __name__ == "__main__":
164
- main()
tools/config.py CHANGED
@@ -105,7 +105,7 @@ CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
105
  ###
106
  # Image options
107
  ###
108
- IMAGES_DPI = get_or_create_env_var('IMAGES_DPI', '300.0')
109
  LOAD_TRUNCATED_IMAGES = get_or_create_env_var('LOAD_TRUNCATED_IMAGES', 'True')
110
  MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to None if blank in file_conversion.py
111
 
@@ -232,6 +232,7 @@ if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == 'True':
232
  aws_model_options.append(TEXTRACT_TEXT_EXTRACT_OPTION)
233
 
234
  TEXT_EXTRACTION_MODELS = local_model_options + aws_model_options
 
235
 
236
  SHOW_LOCAL_PII_DETECTION_OPTIONS = get_or_create_env_var('SHOW_LOCAL_PII_DETECTION_OPTIONS', 'True')
237
  SHOW_AWS_PII_DETECTION_OPTIONS = get_or_create_env_var('SHOW_AWS_PII_DETECTION_OPTIONS', 'True')
@@ -266,6 +267,11 @@ TABULAR_PII_DETECTION_MODELS = PII_DETECTION_MODELS.copy()
266
  if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
267
  TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
268
 
 
 
 
 
 
269
  ### Local OCR model - Tesseract vs PaddleOCR
270
  CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var('CHOSEN_LOCAL_OCR_MODEL', "tesseract") # Choose between "tesseract", "hybrid", and "paddle". "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence.
271
 
@@ -281,11 +287,20 @@ CHOSEN_REDACT_ENTITIES = get_or_create_env_var('CHOSEN_REDACT_ENTITIES', "['TITL
281
 
282
  FULL_ENTITY_LIST = get_or_create_env_var('FULL_ENTITY_LIST', "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']")
283
 
 
285
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
286
- PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
287
 
288
- MAX_TIME_VALUE = get_or_create_env_var('MAX_TIME_VALUE', '999999')
289
 
290
  CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour
291
 
@@ -306,10 +321,19 @@ aws_comprehend_language_choices = get_or_create_env_var("aws_comprehend_language
306
  MAPPED_LANGUAGE_CHOICES = get_or_create_env_var("MAPPED_LANGUAGE_CHOICES", "['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean', 'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']")
307
  LANGUAGE_CHOICES = get_or_create_env_var("LANGUAGE_CHOICES", "['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']")
308
 
309
 
310
 
311
- ### File output options
312
-
 
313
  RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
314
 
315
  COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
@@ -319,27 +343,35 @@ COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") #
319
  ###
320
 
321
  TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tmp/tld/')
322
- try:
323
- extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
324
- except:
325
- extract = TLDExtract(cache_dir=None)
326
 
327
  # Get some environment variables and Launch the Gradio app
328
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
329
 
330
  RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
331
 
 
 
 
 
 
 
332
  MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
333
 
334
- MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
335
 
336
  GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
337
 
338
  ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
339
 
340
- DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '3')
341
 
342
- GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', '')
 
 
 
 
343
 
344
  ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
345
 
@@ -348,8 +380,27 @@ S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_a
348
  if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
349
  else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
350
 
351
- FILE_INPUT_HEIGHT = get_or_create_env_var('FILE_INPUT_HEIGHT', '200')
352
 
 
 
353
 
354
  ###
355
  # COST CODE OPTIONS
 
105
  ###
106
  # Image options
107
  ###
108
+ IMAGES_DPI = float(get_or_create_env_var('IMAGES_DPI', '300.0'))
109
  LOAD_TRUNCATED_IMAGES = get_or_create_env_var('LOAD_TRUNCATED_IMAGES', 'True')
110
  MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to None if blank in file_conversion.py
111
 
 
232
  aws_model_options.append(TEXTRACT_TEXT_EXTRACT_OPTION)
233
 
234
  TEXT_EXTRACTION_MODELS = local_model_options + aws_model_options
235
+ DO_INITIAL_TABULAR_DATA_CLEAN = get_or_create_env_var('DO_INITIAL_TABULAR_DATA_CLEAN', 'True')
236
 
237
  SHOW_LOCAL_PII_DETECTION_OPTIONS = get_or_create_env_var('SHOW_LOCAL_PII_DETECTION_OPTIONS', 'True')
238
  SHOW_AWS_PII_DETECTION_OPTIONS = get_or_create_env_var('SHOW_AWS_PII_DETECTION_OPTIONS', 'True')
 
267
  if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
268
  TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
269
 
270
+ DEFAULT_TEXT_COLUMNS = get_or_create_env_var('DEFAULT_TEXT_COLUMNS', "[]")
271
+ DEFAULT_EXCEL_SHEETS = get_or_create_env_var('DEFAULT_EXCEL_SHEETS', "[]")
272
+
273
+ DEFAULT_TABULAR_ANONYMISATION_STRATEGY = get_or_create_env_var('DEFAULT_TABULAR_ANONYMISATION_STRATEGY', "redact completely")
274
+
275
  ### Local OCR model - Tesseract vs PaddleOCR
276
  CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var('CHOSEN_LOCAL_OCR_MODEL', "tesseract") # Choose between "tesseract", "hybrid", and "paddle". "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence.
277
 
 
287
 
288
  FULL_ENTITY_LIST = get_or_create_env_var('FULL_ENTITY_LIST', "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']")
289
 
290
+ DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX = get_or_create_env_var('DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX', "['Extract handwriting']")
291
+
292
+ DEFAULT_SEARCH_QUERY = get_or_create_env_var('DEFAULT_SEARCH_QUERY', '')
293
+ DEFAULT_FUZZY_SPELLING_MISTAKES_NUM = int(get_or_create_env_var('DEFAULT_FUZZY_SPELLING_MISTAKES_NUM', '1'))
294
+
295
+ DEFAULT_PAGE_MIN = int(get_or_create_env_var('DEFAULT_PAGE_MIN', '0'))
296
+
297
+ DEFAULT_PAGE_MAX = int(get_or_create_env_var('DEFAULT_PAGE_MAX', '999'))
298
+
299
 
300
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
301
+ PAGE_BREAK_VALUE = int(get_or_create_env_var('PAGE_BREAK_VALUE', '99999'))
302
 
303
+ MAX_TIME_VALUE = int(get_or_create_env_var('MAX_TIME_VALUE', '999999'))
304
 
305
  CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour
306
 
 
321
  MAPPED_LANGUAGE_CHOICES = get_or_create_env_var("MAPPED_LANGUAGE_CHOICES", "['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean', 'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']")
322
  LANGUAGE_CHOICES = get_or_create_env_var("LANGUAGE_CHOICES", "['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']")
323
 
324
+ ###
325
+ # Duplicate detection settings
326
+ ###
327
+ DEFAULT_DUPLICATE_DETECTION_THRESHOLD = float(get_or_create_env_var("DEFAULT_DUPLICATE_DETECTION_THRESHOLD", "0.95"))
328
+ DEFAULT_MIN_CONSECUTIVE_PAGES = int(get_or_create_env_var("DEFAULT_MIN_CONSECUTIVE_PAGES", "1"))
329
+ USE_GREEDY_DUPLICATE_DETECTION = get_or_create_env_var("USE_GREEDY_DUPLICATE_DETECTION", "True")
330
+ DEFAULT_COMBINE_PAGES = get_or_create_env_var("DEFAULT_COMBINE_PAGES", "True")
331
+ DEFAULT_MIN_WORD_COUNT = int(get_or_create_env_var("DEFAULT_MIN_WORD_COUNT", "10"))
332
 
333
 
334
+ ###
335
+ # File output options
336
+ ###
337
  RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
338
 
339
  COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
 
343
  ###
344
 
345
  TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tmp/tld/')
346
+ try: extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
347
+ except: extract = TLDExtract(cache_dir=None)
 
 
348
 
349
  # Get some environment variables and Launch the Gradio app
350
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
351
 
352
  RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
353
 
354
+ # Direct mode configuration options
355
+ DIRECT_MODE_TASK = get_or_create_env_var('DIRECT_MODE_TASK', 'redact') # 'redact' or 'deduplicate'
356
+ DIRECT_MODE_INPUT_FILE = get_or_create_env_var('DIRECT_MODE_INPUT_FILE', '') # Path to input file
357
+ DIRECT_MODE_OUTPUT_DIR = get_or_create_env_var('DIRECT_MODE_OUTPUT_DIR', OUTPUT_FOLDER) # Output directory
358
+ DIRECT_MODE_DUPLICATE_TYPE = get_or_create_env_var('DIRECT_MODE_DUPLICATE_TYPE', 'pages') # 'pages' or 'tabular'
359
+
360
  MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
361
 
362
+ MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb').lower()
363
 
364
  GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
365
 
366
  ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
367
 
368
+ DEFAULT_CONCURRENCY_LIMIT = int(get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '3'))
369
 
370
+ FILE_INPUT_HEIGHT = get_or_create_env_var('FILE_INPUT_HEIGHT', '200')
371
+
372
+ ### ALLOW LIST
373
+
374
+ GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', 'False')
375
 
376
  ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
377
 
 
380
  if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
381
  else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
382
 
383
+ ### DENY LIST
384
+
385
+ GET_DEFAULT_DENY_LIST = get_or_create_env_var('GET_DEFAULT_DENY_LIST', 'False')
386
+
387
+ S3_DENY_LIST_PATH = get_or_create_env_var('S3_DENY_LIST_PATH', '') # default_deny_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
388
+
389
+ DENY_LIST_PATH = get_or_create_env_var('DENY_LIST_PATH', '') # config/default_deny_list.csv
390
+
391
+ if DENY_LIST_PATH: OUTPUT_DENY_LIST_PATH = DENY_LIST_PATH
392
+ else: OUTPUT_DENY_LIST_PATH = 'config/default_deny_list.csv'
393
+
394
+ ### WHOLE PAGE REDACTION LIST
395
+
396
+ GET_DEFAULT_WHOLE_PAGE_REDACTION_LIST = get_or_create_env_var('GET_DEFAULT_WHOLE_PAGE_REDACTION_LIST', 'False')
397
+
398
+ S3_WHOLE_PAGE_REDACTION_LIST_PATH = get_or_create_env_var('S3_WHOLE_PAGE_REDACTION_LIST_PATH', '') # default_whole_page_redaction_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
399
+
400
+ WHOLE_PAGE_REDACTION_LIST_PATH = get_or_create_env_var('WHOLE_PAGE_REDACTION_LIST_PATH', '') # config/default_whole_page_redaction_list.csv
401
 
402
+ if WHOLE_PAGE_REDACTION_LIST_PATH: OUTPUT_WHOLE_PAGE_REDACTION_LIST_PATH = WHOLE_PAGE_REDACTION_LIST_PATH
403
+ else: OUTPUT_WHOLE_PAGE_REDACTION_LIST_PATH = 'config/default_whole_page_redaction_list.csv'
404
 
405
  ###
406
  # COST CODE OPTIONS
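The new settings are all read through get_or_create_env_var, so they can be supplied as ordinary environment variables before launch. A sketch of driving the new direct mode for tabular deduplication from the shell — this assumes app.py checks RUN_DIRECT_MODE and the DIRECT_MODE_* values to skip the Gradio UI, which is not visible here because the app.py diff is too large to render:

  export RUN_DIRECT_MODE=1
  export DIRECT_MODE_TASK=deduplicate
  export DIRECT_MODE_DUPLICATE_TYPE=tabular
  export DIRECT_MODE_INPUT_FILE=example_data.csv   # illustrative path
  export DIRECT_MODE_OUTPUT_DIR=output/
  export DEFAULT_DUPLICATE_DETECTION_THRESHOLD=0.95
  export DEFAULT_MIN_WORD_COUNT=10
  python app.py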
tools/custom_csvlogger.py CHANGED
@@ -78,7 +78,7 @@ class CSVLogger_custom(FlaggingCallback):
78
 
79
  if replacement_headers:
80
  if additional_headers is None:
81
- additional_headers = []
82
 
83
  if len(replacement_headers) != len(self.components):
84
  raise ValueError(
@@ -143,18 +143,16 @@ class CSVLogger_custom(FlaggingCallback):
143
  replacement_headers: list[str] | None = None
144
  ) -> int:
145
  if self.first_time:
146
- print("First time creating file")
147
- additional_headers = []
148
  if flag_option is not None:
149
  additional_headers.append("flag")
150
  if username is not None:
151
  additional_headers.append("username")
152
  additional_headers.append("id")
153
- #additional_headers.append("timestamp")
154
  self._create_dataset_file(additional_headers=additional_headers, replacement_headers=replacement_headers)
155
  self.first_time = False
156
 
157
- csv_data = []
158
  for idx, (component, sample) in enumerate(
159
  zip(self.components, flag_data, strict=False)
160
  ):
@@ -214,7 +212,6 @@ class CSVLogger_custom(FlaggingCallback):
214
  try:
215
  print("Connecting to DynamoDB via existing SSO connection")
216
  dynamodb = boto3.resource('dynamodb', region_name=AWS_REGION)
217
- #client = boto3.client('dynamodb')
218
 
219
  test_connection = dynamodb.meta.client.list_tables()
220
 
@@ -224,8 +221,6 @@ class CSVLogger_custom(FlaggingCallback):
224
  print("Trying DynamoDB credentials from environment variables")
225
  dynamodb = boto3.resource('dynamodb',aws_access_key_id=AWS_ACCESS_KEY,
226
  aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
227
- # client = boto3.client('dynamodb',aws_access_key_id=AWS_ACCESS_KEY,
228
- # aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
229
  else:
230
  raise Exception("AWS credentials for DynamoDB logging not found")
231
  else:
@@ -234,12 +229,9 @@ class CSVLogger_custom(FlaggingCallback):
234
  if dynamodb_table_name is None:
235
  raise ValueError("You must provide a dynamodb_table_name if save_to_dynamodb is True")
236
 
237
- if dynamodb_headers:
238
- dynamodb_headers = dynamodb_headers
239
- if not dynamodb_headers and replacement_headers:
240
- dynamodb_headers = replacement_headers
241
- elif headers:
242
- dynamodb_headers = headers
243
  elif not dynamodb_headers:
244
  raise ValueError("Headers not found. You must provide dynamodb_headers or replacement_headers to create a new table.")
245
 
@@ -261,9 +253,6 @@ class CSVLogger_custom(FlaggingCallback):
261
  except botocore.exceptions.ClientError as e:
262
  if e.response['Error']['Code'] == 'ResourceNotFoundException':
263
 
264
- #print(f"Creating DynamoDB table '{dynamodb_table_name}'...")
265
- #print("dynamodb_headers:", dynamodb_headers)
266
-
267
  attribute_definitions = [
268
  {'AttributeName': 'id', 'AttributeType': 'S'} # Only define key attributes here
269
  ]
@@ -288,18 +277,12 @@ class CSVLogger_custom(FlaggingCallback):
288
  try:
289
  item = {
290
  'id': str(generated_id), # UUID primary key
291
- #'created_by': username if username else "unknown",
292
  'timestamp': timestamp,
293
  }
294
 
295
- #print("dynamodb_headers:", dynamodb_headers)
296
- #print("csv_data:", csv_data)
297
-
298
  # Map the headers to values
299
  item.update({header: str(value) for header, value in zip(dynamodb_headers, csv_data)})
300
 
301
- #print("item:", item)
302
-
303
  table.put_item(Item=item)
304
 
305
  print("Successfully uploaded log to DynamoDB")
 
78
 
79
  if replacement_headers:
80
  if additional_headers is None:
81
+ additional_headers = list()
82
 
83
  if len(replacement_headers) != len(self.components):
84
  raise ValueError(
 
143
  replacement_headers: list[str] | None = None
144
  ) -> int:
145
  if self.first_time:
146
+ additional_headers = list()
 
147
  if flag_option is not None:
148
  additional_headers.append("flag")
149
  if username is not None:
150
  additional_headers.append("username")
151
  additional_headers.append("id")
 
152
  self._create_dataset_file(additional_headers=additional_headers, replacement_headers=replacement_headers)
153
  self.first_time = False
154
 
155
+ csv_data = list()
156
  for idx, (component, sample) in enumerate(
157
  zip(self.components, flag_data, strict=False)
158
  ):
 
212
  try:
213
  print("Connecting to DynamoDB via existing SSO connection")
214
  dynamodb = boto3.resource('dynamodb', region_name=AWS_REGION)
 
215
 
216
  test_connection = dynamodb.meta.client.list_tables()
217
 
 
221
  print("Trying DynamoDB credentials from environment variables")
222
  dynamodb = boto3.resource('dynamodb',aws_access_key_id=AWS_ACCESS_KEY,
223
  aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
 
 
224
  else:
225
  raise Exception("AWS credentials for DynamoDB logging not found")
226
  else:
 
229
  if dynamodb_table_name is None:
230
  raise ValueError("You must provide a dynamodb_table_name if save_to_dynamodb is True")
231
 
232
+ if dynamodb_headers: dynamodb_headers = dynamodb_headers
233
+ if not dynamodb_headers and replacement_headers: dynamodb_headers = replacement_headers
234
+ elif headers: dynamodb_headers = headers
 
 
 
235
  elif not dynamodb_headers:
236
  raise ValueError("Headers not found. You must provide dynamodb_headers or replacement_headers to create a new table.")
237
 
 
253
  except botocore.exceptions.ClientError as e:
254
  if e.response['Error']['Code'] == 'ResourceNotFoundException':
255
 
 
 
 
256
  attribute_definitions = [
257
  {'AttributeName': 'id', 'AttributeType': 'S'} # Only define key attributes here
258
  ]
 
277
  try:
278
  item = {
279
  'id': str(generated_id), # UUID primary key
 
280
  'timestamp': timestamp,
281
  }
282
 
 
 
 
283
  # Map the headers to values
284
  item.update({header: str(value) for header, value in zip(dynamodb_headers, csv_data)})
285
 
 
 
286
  table.put_item(Item=item)
287
 
288
  print("Successfully uploaded log to DynamoDB")
tools/data_anonymise.py CHANGED
@@ -6,6 +6,8 @@ import time
6
  import boto3
7
  import botocore
8
  import pandas as pd
 
 
9
  import docx
10
  import gradio as gr
11
  from openpyxl import Workbook
@@ -16,37 +18,76 @@ from botocore.client import BaseClient
16
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
17
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
18
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
19
- from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices
20
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
21
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities, create_nlp_analyser, load_spacy_model
22
  # Use custom version of analyze_dict to be able to track progress
23
  from tools.presidio_analyzer_custom import analyze_dict
24
 
 
 
25
 
26
  fake = Faker("en_UK")
27
  def fake_first_name(x):
28
  return fake.first_name()
29
 
30
- def initial_clean(text:str) -> str:
31
- #### Some of my cleaning functions
32
- html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
33
- html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
34
- non_ascii_pattern = r'[^\x00-\x7F]+'
35
- multiple_spaces_regex = r'\s{2,}'
36
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  # Define a list of patterns and their replacements
38
  patterns = [
 
 
 
39
  (html_pattern_regex, ' '),
40
  (html_start_pattern_end_dots_regex, ' '),
41
  (non_ascii_pattern, ' '),
42
- (multiple_spaces_regex, ' ')
 
 
43
  ]
44
 
45
  # Apply each regex replacement
46
  for pattern, replacement in patterns:
47
- text = re.sub(pattern, replacement, text)
48
 
49
- return text
 
 
 
50
 
51
  def process_recognizer_result(result:RecognizerResult, recognizer_result:RecognizerResult, data_row:int, dictionary_key:int, df_dict:Dict[str, List[Any]], keys_to_keep:List[str]) -> List[str]:
52
  output = list()
@@ -275,7 +316,7 @@ def handle_docx_anonymisation(
275
 
276
  output_xlsx_path = os.path.join(output_folder, f"{file_name_without_ext}_redacted.csv")
277
 
278
- anonymised_df.to_csv(output_xlsx_path, encoding="utf-8-sig")
279
  doc.save(output_docx_path)
280
 
281
  with open(log_file_path, "w", encoding="utf-8-sig") as f:
@@ -304,6 +345,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
304
  aws_access_key_textbox:str='',
305
  aws_secret_key_textbox:str='',
306
  actual_time_taken_number:float=0,
 
307
  language: Optional[str] = None,
308
  progress: Progress = Progress(track_tqdm=True)):
309
  """
@@ -334,6 +376,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
334
  - actual_time_taken_number (float, optional): Time taken to do the redaction.
335
  - language (str, optional): The language of the text to anonymise.
336
  - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
 
337
  """
338
 
339
  tic = time.perf_counter()
@@ -431,7 +474,7 @@ def anonymise_files_with_open_text(file_paths: List[str],
431
  sheet_name = ""
432
  file_type = ""
433
 
434
- out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=OUTPUT_FOLDER)
435
  else:
436
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
437
  file_type = detect_file_type(anon_file)
@@ -482,14 +525,14 @@ def anonymise_files_with_open_text(file_paths: List[str],
482
 
483
  anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
484
 
485
- out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
486
 
487
  else:
488
  sheet_name = ""
489
  anon_df = read_file(anon_file)
490
  out_file_part = get_file_name_without_type(anon_file.name)
491
 
492
- out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
493
 
494
  # Increase latest file completed count unless we are at the last file
495
  if latest_file_completed != len(file_paths):
@@ -543,7 +586,8 @@ def tabular_anonymise_wrapper_func(
543
  comprehend_query_number:int=0,
544
  comprehend_client:botocore.client.BaseClient="",
545
  nlp_analyser: AnalyzerEngine = nlp_analyser,
546
- output_folder: str = OUTPUT_FOLDER
 
547
  ):
548
  """
549
  This function wraps the anonymisation process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymisation strategy using the anonymise_script function, and exports the anonymised data to a file.
@@ -570,6 +614,7 @@ def tabular_anonymise_wrapper_func(
570
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
571
  - comprehend_client (optional): The client object from AWS containing a client connection to AWS Comprehend if that option is chosen on the first tab.
572
  - output_folder: The folder where the anonymized files will be saved. Defaults to the 'output_folder' variable.
 
573
  """
574
  def check_lists(list1, list2):
575
  return any(string in list2 for string in list1)
@@ -610,12 +655,15 @@ def tabular_anonymise_wrapper_func(
610
  # Split dataframe to keep only selected columns
611
  #print("Remaining columns to redact:", chosen_cols_in_anon_df)
612
 
 
 
 
613
  anon_df_part = anon_df[chosen_cols_in_anon_df]
614
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
615
 
616
 
617
  # Anonymise the selected columns
618
- anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser)
619
 
620
  anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
621
 
@@ -683,20 +731,35 @@ def anonymise_script(df:pd.DataFrame,
683
  comprehend_client:botocore.client.BaseClient="",
684
  custom_entities:List[str]=custom_entities,
685
  nlp_analyser: AnalyzerEngine = nlp_analyser,
686
- progress:Progress=Progress(track_tqdm=False)):
 
687
  '''
688
  Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
689
  '''
690
 
691
  print("Identifying personal information")
692
  analyse_tic = time.perf_counter()
693
 
694
  # Initialize analyzer_results as an empty dictionary to store results by column
695
- results_by_column = {}
696
- key_string = ""
697
-
698
- # DataFrame to dict
699
- df_dict = df.to_dict(orient="list")
700
 
701
  if isinstance(in_allow_list, list):
702
  if in_allow_list:
@@ -714,13 +777,14 @@ def anonymise_script(df:pd.DataFrame,
714
  ### Language check - check if selected language packs exist
715
  try:
716
  if language != "en":
717
- progress(0.1, desc=f"Loading SpaCy model for {language}")
718
 
719
  load_spacy_model(language)
720
 
721
  except Exception as e:
722
- print(f"Error downloading language packs for {language}: {e}")
723
- raise Exception(f"Error downloading language packs for {language}: {e}")
 
724
 
725
  # Try updating the supported languages for the spacy analyser
726
  try:
@@ -730,8 +794,9 @@ def anonymise_script(df:pd.DataFrame,
730
  gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
731
 
732
  except Exception as e:
733
- print(f"Error creating nlp_analyser for {language}: {e}")
734
- raise Exception(f"Error creating nlp_analyser for {language}: {e}")
 
735
 
736
  if isinstance(in_deny_list, pd.DataFrame):
737
  if not in_deny_list.empty:
@@ -758,6 +823,14 @@ def anonymise_script(df:pd.DataFrame,
758
  batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
759
  analyzer_results = list()
760
 
 
 
 
 
 
 
 
 
761
  if pii_identification_method == "Local":
762
 
763
  # Use custom analyzer to be able to track progress with Gradio
 
6
  import boto3
7
  import botocore
8
  import pandas as pd
9
+ import polars as pl
10
+ import unicodedata
11
  import docx
12
  import gradio as gr
13
  from openpyxl import Workbook
 
18
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
19
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
20
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
21
+ from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices, DO_INITIAL_TABULAR_DATA_CLEAN
22
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
23
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities, create_nlp_analyser, load_spacy_model
24
  # Use custom version of analyze_dict to be able to track progress
25
  from tools.presidio_analyzer_custom import analyze_dict
26
 
27
+ if DO_INITIAL_TABULAR_DATA_CLEAN == "True": DO_INITIAL_TABULAR_DATA_CLEAN = True
28
+ else: DO_INITIAL_TABULAR_DATA_CLEAN = False
29
 
30
  fake = Faker("en_UK")
31
  def fake_first_name(x):
32
  return fake.first_name()
33
 
34
+ # Text cleaning regex patterns and helper function used before tabular anonymisation
35
+ url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(?:www\.)[a-zA-Z0-9._-]+\.[a-zA-Z]{2,}'
36
+ html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
37
+ html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
38
+ non_ascii_pattern = r'[^\x00-\x7F]+'
39
+ and_sign_regex = r'&'
40
+ multiple_spaces_regex = r'\s{2,}'
41
+ multiple_new_lines_regex = r'(\r\n|\n)+'
42
+ multiple_punctuation_regex = r"(\p{P})\p{P}+"
43
+
44
+ def initial_clean(texts:pd.Series) -> pd.Series:
45
+ '''
46
+ Clean a series of text values: normalise unicode and smart punctuation, then strip URLs, HTML tags, non-ASCII characters and redundant whitespace.
47
+ '''
48
+ texts = texts.copy()
+ for i, text in texts.items():
49
+ if not text or pd.isnull(text):
50
+ text = ""
51
+
52
+ # Normalize unicode characters to decompose any special forms
53
+ normalized_text = unicodedata.normalize('NFKC', str(text))
54
+
55
+ # Replace smart quotes and special punctuation with standard ASCII equivalents
56
+ replacements = {
57
+ '‘': "'", '’': "'", '“': '"', '”': '"',
58
+ '–': '-', '—': '-', '…': '...', '•': '*',
59
+ }
60
+
61
+ # Perform replacements
62
+ for old_char, new_char in replacements.items():
63
+ normalized_text = normalized_text.replace(old_char, new_char)
64
+
65
+ texts.at[i] = normalized_text
66
+
67
+ # Convert to polars Series
68
+ texts = pl.Series(texts).str.strip_chars()
69
+
70
  # Define a list of patterns and their replacements
71
  patterns = [
72
+ (multiple_new_lines_regex, ' '),
73
+ (r'\r', ''),
74
+ (url_pattern, ' '),
75
  (html_pattern_regex, ' '),
76
  (html_start_pattern_end_dots_regex, ' '),
77
  (non_ascii_pattern, ' '),
78
+ (multiple_spaces_regex, ' '),
79
+ (multiple_punctuation_regex, "${1}"),
80
+ (and_sign_regex, 'and')
81
  ]
82
 
83
  # Apply each regex replacement
84
  for pattern, replacement in patterns:
85
+ texts = texts.str.replace_all(pattern, replacement)
86
 
87
+ # Convert the series back to a list
88
+ texts = texts.to_list()
89
+
90
+ return texts
91
 
92
  def process_recognizer_result(result:RecognizerResult, recognizer_result:RecognizerResult, data_row:int, dictionary_key:int, df_dict:Dict[str, List[Any]], keys_to_keep:List[str]) -> List[str]:
93
  output = list()
 
316
 
317
  output_xlsx_path = os.path.join(output_folder, f"{file_name_without_ext}_redacted.csv")
318
 
319
+ anonymised_df.to_csv(output_xlsx_path, encoding="utf-8-sig", index=None)
320
  doc.save(output_docx_path)
321
 
322
  with open(log_file_path, "w", encoding="utf-8-sig") as f:
 
345
  aws_access_key_textbox:str='',
346
  aws_secret_key_textbox:str='',
347
  actual_time_taken_number:float=0,
348
+ do_initial_clean:bool=DO_INITIAL_TABULAR_DATA_CLEAN,
349
  language: Optional[str] = None,
350
  progress: Progress = Progress(track_tqdm=True)):
351
  """
 
376
  - actual_time_taken_number (float, optional): Time taken to do the redaction.
377
  - language (str, optional): The language of the text to anonymise.
378
  - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
379
+ - do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to the DO_INITIAL_TABULAR_DATA_CLEAN config value.
380
  """
381
 
382
  tic = time.perf_counter()
 
474
  sheet_name = ""
475
  file_type = ""
476
 
477
+ out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=OUTPUT_FOLDER, do_initial_clean=do_initial_clean)
478
  else:
479
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
480
  file_type = detect_file_type(anon_file)
 
525
 
526
  anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
527
 
528
+ out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder, do_initial_clean=do_initial_clean)
529
 
530
  else:
531
  sheet_name = ""
532
  anon_df = read_file(anon_file)
533
  out_file_part = get_file_name_without_type(anon_file.name)
534
 
535
+ out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder, do_initial_clean=do_initial_clean)
536
 
537
  # Increase latest file completed count unless we are at the last file
538
  if latest_file_completed != len(file_paths):
 
586
  comprehend_query_number:int=0,
587
  comprehend_client:botocore.client.BaseClient="",
588
  nlp_analyser: AnalyzerEngine = nlp_analyser,
589
+ output_folder: str = OUTPUT_FOLDER,
590
+ do_initial_clean:bool=DO_INITIAL_TABULAR_DATA_CLEAN
591
  ):
592
  """
593
  This function wraps the anonymisation process for a given dataframe. It filters the dataframe based on chosen columns, applies the specified anonymisation strategy using the anonymise_script function, and exports the anonymised data to a file.
 
614
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
615
  - comprehend_client (optional): The client object from AWS containing a client connection to AWS Comprehend if that option is chosen on the first tab.
616
  - output_folder: The folder where the anonymized files will be saved. Defaults to the 'output_folder' variable.
617
+ - do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to the DO_INITIAL_TABULAR_DATA_CLEAN config value.
618
  """
619
  def check_lists(list1, list2):
620
  return any(string in list2 for string in list1)
 
655
  # Split dataframe to keep only selected columns
656
  #print("Remaining columns to redact:", chosen_cols_in_anon_df)
657
 
658
+ if not anon_df.index.is_unique:
659
+ anon_df = anon_df.reset_index(drop=True)
660
+
661
  anon_df_part = anon_df[chosen_cols_in_anon_df]
662
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
663
 
664
 
665
  # Anonymise the selected columns
666
+ anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser, do_initial_clean=do_initial_clean)
667
 
668
  anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
669
 
 
731
  comprehend_client:botocore.client.BaseClient="",
732
  custom_entities:List[str]=custom_entities,
733
  nlp_analyser: AnalyzerEngine = nlp_analyser,
734
+ do_initial_clean:bool=DO_INITIAL_TABULAR_DATA_CLEAN,
735
+ progress:Progress=Progress(track_tqdm=True)):
736
  '''
737
  Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
738
+
739
+ Args:
740
+ df (pd.DataFrame): The input DataFrame containing text to be anonymised.
741
+ anon_strat (str): The anonymisation strategy to apply (e.g., "replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely").
742
+ language (str): The language of the text for analysis (e.g., "en", "es").
743
+ chosen_redact_entities (List[str]): A list of entity types to redact using the local (Presidio) method.
744
+ in_allow_list (List[str], optional): A list of terms to explicitly allow and not redact. Defaults to an empty list.
745
+ in_deny_list (List[str], optional): A list of terms to explicitly deny and always redact. Defaults to an empty list.
746
+ max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of fuzzy spelling mistakes to tolerate for custom recognizers. Defaults to 0.
747
+ pii_identification_method (str, optional): The method for PII identification ("Local", "AWS Comprehend", or "Both"). Defaults to "Local".
748
+ chosen_redact_comprehend_entities (List[str], optional): A list of entity types to redact using AWS Comprehend. Defaults to an empty list.
749
+ comprehend_query_number (int, optional): The number of queries to send to AWS Comprehend per batch. Defaults to 0.
750
+ comprehend_client (botocore.client.BaseClient, optional): An initialized AWS Comprehend client. Defaults to an empty string.
751
+ custom_entities (List[str], optional): A list of custom entities to be recognized. Defaults to `custom_entities`.
752
+ nlp_analyser (AnalyzerEngine, optional): The Presidio AnalyzerEngine instance to use. Defaults to `nlp_analyser`.
753
+ do_initial_clean (bool, optional): Whether to perform an initial cleaning of the text. Defaults to the DO_INITIAL_TABULAR_DATA_CLEAN config value.
754
+ progress (Progress, optional): Gradio Progress object for tracking progress. Defaults to Progress(track_tqdm=True).
755
  '''
756
 
757
  print("Identifying personal information")
758
  analyse_tic = time.perf_counter()
759
 
760
  # Initialize analyzer_results as an empty dictionary to store results by column
761
+ results_by_column = dict()
762
+ key_string = ""
 
 
 
763
 
764
  if isinstance(in_allow_list, list):
765
  if in_allow_list:
 
777
  ### Language check - check if selected language packs exist
778
  try:
779
  if language != "en":
780
+ progress(0.1, desc=f"Loading spaCy model for {language}")
781
 
782
  load_spacy_model(language)
783
 
784
  except Exception as e:
785
+ out_message = f"Error downloading language packs for {language}: {e}"
786
+ print(out_message)
787
+ raise Exception(out_message)
788
 
789
  # Try updating the supported languages for the spacy analyser
790
  try:
 
794
  gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
795
 
796
  except Exception as e:
797
+ out_message = f"Error creating nlp_analyser for {language}: {e}"
798
+ print(out_message)
799
+ raise Exception(out_message)
800
 
801
  if isinstance(in_deny_list, pd.DataFrame):
802
  if not in_deny_list.empty:
 
823
  batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
824
  analyzer_results = list()
825
 
826
+ if do_initial_clean:
827
+ progress(0.2, desc="Cleaning text")
828
+ for col in progress.tqdm(df.columns, desc="Cleaning text", unit = "Columns"):
829
+ df[col] = initial_clean(df[col])
830
+
831
+ # DataFrame to dict
832
+ df_dict = df.to_dict(orient="list")
833
+
834
  if pii_identification_method == "Local":
835
 
836
  # Use custom analyzer to be able to track progress with Gradio
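A small sketch of what the optional initial clean does to a text column before PII analysis; the sample values are invented and the exact output depends on the regex patterns defined earlier in this file.

import pandas as pd

sample = pd.Series(["Visit https://example.com today", "Fish &amp; chips", "Smart “quotes” here"])
cleaned = initial_clean(sample)
# URLs and HTML entities are stripped, remaining '&' characters become 'and',
# smart quotes are normalised to ASCII, and repeated whitespace is collapsed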
tools/example_cli_calls.txt CHANGED
@@ -21,4 +21,10 @@ python your_cli_script.py \
21
  --output_dir "output/anonymised_docs/" \
22
  --anon_strat "encrypt" \
23
  --deny_list "config/codenames.csv" \
24
- --language "en"
 
 
 
 
 
 
 
21
  --output_dir "output/anonymised_docs/" \
22
  --anon_strat "encrypt" \
23
  --deny_list "config/codenames.csv" \
24
+ --language "en"
25
+
26
+ python cli_redact.py --task deduplicate --input_file ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 --min_word_count 5
27
+
28
+ python cli_redact.py --task deduplicate --input_file data.csv --duplicate_type tabular --text_columns "Name" "Email" "Description"
29
+
30
+ python cli_redact.py --task deduplicate --input_file ocr_output.csv --duplicate_type pages --search_query "confidential information"
tools/file_redaction.py CHANGED
@@ -92,7 +92,7 @@ def choose_and_run_redactor(file_paths:List[str],
92
  chosen_redact_comprehend_entities:List[str],
93
  text_extraction_method:str,
94
  in_allow_list:List[List[str]]=list(),
95
- custom_recogniser_word_list:List[str]=list(),
96
  redact_whole_page_list:List[str]=list(),
97
  latest_file_completed:int=0,
98
  combined_out_message:List=list(),
@@ -147,8 +147,8 @@ def choose_and_run_redactor(file_paths:List[str],
147
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
148
  - text_extraction_method (str): The method to use to extract text from documents.
149
  - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
150
- - custom_recogniser_word_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
151
- - redact_whole_page_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
152
  - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
153
  - combined_out_message (list, optional): A list to store output messages. Defaults to an empty list.
154
  - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
@@ -390,11 +390,11 @@ def choose_and_run_redactor(file_paths:List[str],
390
  in_allow_list_flat = list()
391
 
392
  # If string, assume file path
393
- if isinstance(custom_recogniser_word_list, str):
394
- custom_recogniser_word_list = pd.read_csv(custom_recogniser_word_list)
395
- if isinstance(custom_recogniser_word_list, pd.DataFrame):
396
- if not custom_recogniser_word_list.empty:
397
- custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
398
  else:
399
  custom_recogniser_word_list_flat = list()
400
 
@@ -1383,7 +1383,7 @@ def redact_image_pdf(file_path:str,
1383
  comprehend_query_number:int=0,
1384
  comprehend_client:str="",
1385
  textract_client:str="",
1386
- custom_recogniser_word_list:List[str]=list(),
1387
  redact_whole_page_list:List[str]=list(),
1388
  max_fuzzy_spelling_mistakes_num:int=1,
1389
  match_fuzzy_whole_phrase_bool:bool=True,
@@ -1423,7 +1423,7 @@ def redact_image_pdf(file_path:str,
1423
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
1424
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
1425
  - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
1426
- - custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
1427
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
1428
  - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
1429
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
@@ -1459,13 +1459,13 @@ def redact_image_pdf(file_path:str,
1459
  raise Exception(f"Error creating nlp_analyser for {language}: {e}")
1460
 
1461
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1462
- if custom_recogniser_word_list:
1463
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1464
- new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
1465
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1466
 
1467
  nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
1468
- new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
1469
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
1470
 
1471
  # Only load in PaddleOCR models if not running Textract
@@ -2216,7 +2216,7 @@ def redact_text_pdf(
2216
  pii_identification_method: str = "Local",
2217
  comprehend_query_number:int = 0,
2218
  comprehend_client="",
2219
- custom_recogniser_word_list:List[str]=list(),
2220
  redact_whole_page_list:List[str]=list(),
2221
  max_fuzzy_spelling_mistakes_num:int=1,
2222
  match_fuzzy_whole_phrase_bool:bool=True,
@@ -2250,7 +2250,7 @@ def redact_text_pdf(
2250
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
2251
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
2252
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
2253
- - custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
2254
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
2255
  - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
2256
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
@@ -2290,13 +2290,13 @@ def redact_text_pdf(
2290
  raise Exception(f"Error creating nlp_analyser for {language}: {e}")
2291
 
2292
  # Update custom word list analyser object with any new words that have been added to the custom deny list
2293
- if custom_recogniser_word_list:
2294
  nlp_analyser.registry.remove_recognizer("CUSTOM")
2295
- new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
2296
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
2297
 
2298
  nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
2299
- new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
2300
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
2301
 
2302
  # Open with Pikepdf to get text lines
@@ -2385,9 +2385,7 @@ def redact_text_pdf(
2385
  all_page_line_text_extraction_characters.extend(line_characters)
2386
  all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
2387
 
2388
- #print("page_text_ocr_outputs_list:", page_text_ocr_outputs_list)
2389
  page_text_ocr_outputs = pd.concat(page_text_ocr_outputs_list)
2390
- #page_text_ocr_outputs.to_csv("output/page_text_ocr_outputs.csv")
2391
 
2392
  ### REDACTION
2393
  if pii_identification_method != NO_REDACTION_PII_OPTION:
 
92
  chosen_redact_comprehend_entities:List[str],
93
  text_extraction_method:str,
94
  in_allow_list:List[List[str]]=list(),
95
+ in_deny_list:List[str]=list(),
96
  redact_whole_page_list:List[str]=list(),
97
  latest_file_completed:int=0,
98
  combined_out_message:List=list(),
 
147
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
148
  - text_extraction_method (str): The method to use to extract text from documents.
149
  - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
150
+ - in_deny_list (List[List[str]], optional): A list of denied terms for redaction. Defaults to None.
151
+ - redact_whole_page_list (List[List[str]], optional): A list of whole page numbers for redaction. Defaults to None.
152
  - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
153
  - combined_out_message (list, optional): A list to store output messages. Defaults to an empty list.
154
  - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
 
390
  in_allow_list_flat = list()
391
 
392
  # If string, assume file path
393
+ if isinstance(in_deny_list, str):
394
+ in_deny_list = pd.read_csv(in_deny_list)
395
+ if isinstance(in_deny_list, pd.DataFrame):
396
+ if not in_deny_list.empty:
397
+ custom_recogniser_word_list_flat = in_deny_list.iloc[:, 0].tolist()
398
  else:
399
  custom_recogniser_word_list_flat = list()
400
 
 
1383
  comprehend_query_number:int=0,
1384
  comprehend_client:str="",
1385
  textract_client:str="",
1386
+ in_deny_list:List[str]=list(),
1387
  redact_whole_page_list:List[str]=list(),
1388
  max_fuzzy_spelling_mistakes_num:int=1,
1389
  match_fuzzy_whole_phrase_bool:bool=True,
 
1423
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
1424
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
1425
  - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
1426
+ - in_deny_list (optional): A list of custom words that the user has chosen specifically to redact.
1427
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
1428
  - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
1429
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
 
1459
  raise Exception(f"Error creating nlp_analyser for {language}: {e}")
1460
 
1461
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1462
+ if in_deny_list:
1463
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1464
+ new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
1465
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1466
 
1467
  nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
1468
+ new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=in_deny_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
1469
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
1470
 
1471
  # Only load in PaddleOCR models if not running Textract
 
2216
  pii_identification_method: str = "Local",
2217
  comprehend_query_number:int = 0,
2218
  comprehend_client="",
2219
+ in_deny_list:List[str]=list(),
2220
  redact_whole_page_list:List[str]=list(),
2221
  max_fuzzy_spelling_mistakes_num:int=1,
2222
  match_fuzzy_whole_phrase_bool:bool=True,
 
2250
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
2251
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
2252
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
2253
+ - in_deny_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
2254
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
2255
  - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
2256
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
 
2290
  raise Exception(f"Error creating nlp_analyser for {language}: {e}")
2291
 
2292
  # Update custom word list analyser object with any new words that have been added to the custom deny list
2293
+ if in_deny_list:
2294
  nlp_analyser.registry.remove_recognizer("CUSTOM")
2295
+ new_custom_recogniser = custom_word_list_recogniser(in_deny_list)
2296
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
2297
 
2298
  nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
2299
+ new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=in_deny_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
2300
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
2301
 
2302
  # Open with Pikepdf to get text lines
 
2385
  all_page_line_text_extraction_characters.extend(line_characters)
2386
  all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
2387
 
 
2388
  page_text_ocr_outputs = pd.concat(page_text_ocr_outputs_list)
 
2389
 
2390
  ### REDACTION
2391
  if pii_identification_method != NO_REDACTION_PII_OPTION:
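A brief sketch of how the renamed in_deny_list argument is consumed by choose_and_run_redactor, based on the code above: a CSV path is read into a DataFrame and its first column becomes the flat custom recogniser word list. The file name below is hypothetical.

import pandas as pd

in_deny_list = "config/default_deny_list.csv"  # hypothetical path
if isinstance(in_deny_list, str):
    in_deny_list = pd.read_csv(in_deny_list)
custom_recogniser_word_list_flat = (
    in_deny_list.iloc[:, 0].tolist() if not in_deny_list.empty else []
)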
tools/find_duplicate_pages.py CHANGED
@@ -1,8 +1,6 @@
1
  import pandas as pd
2
  import os
3
  import re
4
- import itertools
5
- import numpy as np
6
 
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
@@ -16,9 +14,10 @@ from tools.helper_functions import OUTPUT_FOLDER
16
  from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
17
  from tools.load_spacy_model_custom_recognisers import nlp
18
 
19
- similarity_threshold = 0.95
20
  number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
21
  ID_MULTIPLIER = 100000
 
 
22
 
23
  def split_text_with_punctuation(text: str) -> List[str]:
24
  """
@@ -604,8 +603,7 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,
604
 
605
  return output_paths
606
 
607
- # Define the set of punctuation characters for efficient lookup
608
- PUNCTUATION_TO_STRIP = {'.', ',', '?', '!', ':', ';'}
609
 
610
  def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
611
  """
 
1
  import pandas as pd
2
  import os
3
  import re
 
 
4
 
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
 
14
  from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
15
  from tools.load_spacy_model_custom_recognisers import nlp
16
 
 
17
  number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
18
  ID_MULTIPLIER = 100000
19
+ # Define the set of punctuation characters for efficient lookup
20
+ PUNCTUATION_TO_STRIP = {'.', ',', '?', '!', ':', ';'}
21
 
22
  def split_text_with_punctuation(text: str) -> List[str]:
23
  """
 
603
 
604
  return output_paths
605
 
606
+
 
607
 
608
  def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
609
  """
tools/find_duplicate_tabular.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import re
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from typing import List, Tuple, Dict
7
+ import gradio as gr
8
+ from gradio import Progress
9
+ from pathlib import Path
10
+ from tools.helper_functions import OUTPUT_FOLDER, read_file
11
+ from tools.data_anonymise import initial_clean
12
+ from tools.load_spacy_model_custom_recognisers import nlp
13
+ from tools.config import DO_INITIAL_TABULAR_DATA_CLEAN
14
+
15
+ similarity_threshold = 0.95
16
+
17
+ def clean_and_stem_text_series(df: pd.DataFrame, column: str, do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN):
18
+ """
19
+ Clean and lemmatise a text column in a tabular DataFrame, writing the result to a 'text_clean' column.
20
+ """
21
+
22
+ # Function to apply lemmatisation and remove stopwords
23
+ def _apply_lemmatization(text):
24
+ doc = nlp(text)
25
+ # Keep only alphabetic tokens and remove stopwords
26
+ lemmatized_words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
27
+ return ' '.join(lemmatized_words)
28
+
29
+ if do_initial_clean_dup:
30
+ df['text_clean'] = initial_clean(df[column])
+ else:
+ df['text_clean'] = df[column].fillna('').astype(str)
31
+
32
+ df['text_clean'] = df['text_clean'].apply(_apply_lemmatization)
33
+ df['text_clean'] = df['text_clean'].str.lower()#.str.replace(r'[^\w\s]', '', regex=True)
34
+
35
+ return df
36
+
37
+ def convert_tabular_data_to_analysis_format(
38
+ df: pd.DataFrame,
39
+ file_name: str,
40
+ text_columns: List[str] = None
41
+ ) -> List[Tuple[str, pd.DataFrame]]:
42
+ """
43
+ Convert tabular data (CSV/XLSX) to the format needed for duplicate analysis.
44
+
45
+ Args:
46
+ df (pd.DataFrame): The input DataFrame
47
+ file_name (str): Name of the file
48
+ text_columns (List[str], optional): Columns to analyze for duplicates.
49
+ If None, uses all string columns.
50
+
51
+ Returns:
52
+ List[Tuple[str, pd.DataFrame]]: List containing (file_name, processed_df) tuple
53
+ """
54
+ if text_columns is None:
55
+ # Auto-detect text columns (string type columns)
56
+ text_columns = df.select_dtypes(include=['object', 'string']).columns.tolist()
57
+
58
+ if not text_columns:
59
+ print(f"No text columns found in {file_name}")
60
+ return []
61
+
62
+ # Create a copy to avoid modifying original
63
+ df_copy = df.copy()
64
+
65
+ # Create a combined text column from all text columns
66
+ df_copy['combined_text'] = df_copy[text_columns].fillna('').astype(str).agg(' '.join, axis=1)
67
+
68
+ # Add row identifier
69
+ df_copy['row_id'] = df_copy.index
70
+
71
+ # Create the format expected by the duplicate detection system
72
+ # Using 'page' as row number and 'text' as the combined text
73
+ processed_df = pd.DataFrame({
74
+ 'page': df_copy['row_id'],
75
+ 'text': df_copy['combined_text'],
76
+ 'file': file_name
77
+ })
78
+
79
+ # Add original row data for reference
80
+ for col in text_columns:
81
+ processed_df[f'original_{col}'] = df_copy[col]
82
+
83
+ return [(file_name, processed_df)]
84
+
85
+ def find_duplicate_cells_in_tabular_data(
86
+ input_files: List[str],
87
+ similarity_threshold: float = 0.95,
88
+ min_word_count: int = 3,
89
+ text_columns: List[str] = None,
90
+ output_folder: str = OUTPUT_FOLDER,
91
+ do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
92
+ progress: Progress = Progress(track_tqdm=True)
93
+ ) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
94
+ """
95
+ Find duplicate cells/text in tabular data files (CSV, XLSX).
96
+
97
+ Args:
98
+ input_files (List[str]): List of file paths to analyze
99
+ similarity_threshold (float): Minimum similarity score to consider duplicates
100
+ min_word_count (int): Minimum word count for text to be considered
101
+ text_columns (List[str], optional): Specific columns to analyze
102
+ output_folder (str, optional): Output folder for results
103
+ do_initial_clean_dup (bool, optional): Whether to do initial clean of text
104
+ progress (Progress): Progress tracking object
105
+
106
+ Returns:
107
+ Tuple containing:
108
+ - results_df: DataFrame with duplicate matches
109
+ - output_paths: List of output file paths
110
+ - full_data_by_file: Dictionary of processed data by file
111
+ """
112
+
113
+ if not input_files:
114
+ raise gr.Error("Please upload files to analyze.")
115
+
116
+ progress(0.1, desc="Loading and processing files...")
117
+
118
+ all_data_to_process = []
119
+ full_data_by_file = {}
120
+ file_paths = []
121
+
122
+ # Process each file
123
+ for file_path in input_files:
124
+ try:
125
+ df = read_file(file_path)
126
+
127
+ file_name = os.path.basename(file_path)
128
+ file_paths.append(file_path)
129
+
130
+ # Convert to analysis format
131
+ processed_data = convert_tabular_data_to_analysis_format(
132
+ df, file_name, text_columns
133
+ )
134
+
135
+ if processed_data:
136
+ all_data_to_process.extend(processed_data)
137
+ full_data_by_file[file_name] = processed_data[0][1]
138
+
139
+ except Exception as e:
140
+ print(f"Error processing {file_path}: {e}")
141
+ continue
142
+
143
+ if not all_data_to_process:
144
+ raise gr.Error("No valid data found in uploaded files.")
145
+
146
+ progress(0.2, desc="Combining data...")
147
+
148
+ # Combine all data
149
+ combined_df = pd.concat([data[1] for data in all_data_to_process], ignore_index=True)
150
+
151
+ progress(0.3, desc="Cleaning and preparing text...")
152
+
153
+ # Clean and prepare text
154
+ combined_df = clean_and_stem_text_series(combined_df, 'text', do_initial_clean_dup=do_initial_clean_dup)
155
+
156
+ # Filter by minimum word count
157
+ combined_df['word_count'] = combined_df['text_clean'].str.split().str.len().fillna(0)
158
+ combined_df = combined_df[combined_df['word_count'] >= min_word_count].copy()
159
+
160
+ if len(combined_df) < 2:
161
+ return pd.DataFrame(), [], full_data_by_file
162
+
163
+ progress(0.4, desc="Calculating similarities...")
164
+
165
+ # Calculate similarities
166
+ vectorizer = TfidfVectorizer()
167
+ tfidf_matrix = vectorizer.fit_transform(combined_df['text_clean'])
168
+ similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
169
+
170
+ # Find similar pairs
171
+ coo_matrix = similarity_matrix.tocoo()
172
+ similar_pairs = [
173
+ (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
174
+ if r < c and v >= similarity_threshold
175
+ ]
176
+
177
+ if not similar_pairs:
178
+ gr.Info("No duplicate cells found.")
179
+ return pd.DataFrame(), [], full_data_by_file
180
+
181
+ progress(0.7, desc="Processing results...")
182
+
183
+ # Create results DataFrame
184
+ results_data = []
185
+ for row1, row2, similarity in similar_pairs:
186
+ row1_data = combined_df.iloc[row1]
187
+ row2_data = combined_df.iloc[row2]
188
+
189
+ results_data.append({
190
+ 'File1': row1_data['file'],
191
+ 'Row1': int(row1_data['page']),
192
+ 'File2': row2_data['file'],
193
+ 'Row2': int(row2_data['page']),
194
+ 'Similarity_Score': round(similarity, 3),
195
+ 'Text1': row1_data['text'][:200] + '...' if len(row1_data['text']) > 200 else row1_data['text'],
196
+ 'Text2': row2_data['text'][:200] + '...' if len(row2_data['text']) > 200 else row2_data['text'],
197
+ 'Original_Index1': row1,
198
+ 'Original_Index2': row2
199
+ })
200
+
201
+ results_df = pd.DataFrame(results_data)
202
+ results_df = results_df.sort_values(['File1', 'Row1', 'File2', 'Row2'])
203
+
204
+ progress(0.9, desc="Saving results...")
205
+
206
+ # Save results
207
+ output_paths = save_tabular_duplicate_results(results_df, output_folder, file_paths, file_replaced_index=0)
208
+
209
+ gr.Info(f"Found {len(results_df)} duplicate cell matches")
210
+
211
+ return results_df, output_paths, full_data_by_file
212
+
213
+ def save_tabular_duplicate_results(results_df: pd.DataFrame, output_folder: str, file_paths: List[str], file_replaced_index: int = 0) -> List[str]:
214
+ """
215
+ Save tabular duplicate detection results to files.
216
+
217
+ Args:
218
+ results_df (pd.DataFrame): Results DataFrame
219
+ output_folder (str): Output folder path
220
+ file_paths (List[str]): List of file paths
221
+ file_replaced_index (int): Index of the file to replace with duplicate rows removed
222
+ (0 is the first file in the list)
223
+ Returns:
224
+ List[str]: List of output file paths
225
+ """
226
+ output_paths = []
227
+ output_folder_path = Path(output_folder)
228
+ output_folder_path.mkdir(exist_ok=True)
229
+
230
+ if results_df.empty:
231
+ print("No duplicate matches to save.")
232
+ return []
233
+
234
+ # Save main results
235
+ results_file = output_folder_path / 'tabular_duplicate_results.csv'
236
+ results_df.to_csv(results_file, index=False, encoding="utf-8-sig")
237
+ output_paths.append(str(results_file))
238
+
239
+ # Save per-file duplicate lists
240
+ for file_name, group in results_df.groupby('File1'):
241
+ file_stem = Path(file_name).stem
242
+ duplicate_rows_file = output_folder_path / f"{file_stem}_duplicate_rows.csv"
243
+
244
+ # Get unique row numbers to remove
245
+ rows_to_remove = sorted(group['Row1'].unique())
246
+ duplicate_df = pd.DataFrame({'Row_to_Remove': rows_to_remove})
247
+ duplicate_df.to_csv(duplicate_rows_file, index=False)
248
+ output_paths.append(str(duplicate_rows_file))
249
+
250
+ # Save also original file (first file in list) with duplicate rows removed
251
+ file_path = file_paths[file_replaced_index]
252
+ file_base_name = os.path.basename(file_path)
253
+ df = read_file(file_path)
254
+ df_cleaned = df.drop(index=rows_to_remove).reset_index(drop=True)
255
+
256
+ output_path = os.path.join(output_folder, f"{os.path.splitext(file_base_name)[0]}_deduplicated.csv")
257
+ df_cleaned.to_csv(output_path, index=False, encoding="utf-8-sig")
258
+
259
+ output_paths.append(str(output_path))
260
+
261
+ return output_paths
262
+
263
+ def remove_duplicate_rows_from_tabular_data(
264
+ file_path: str,
265
+ duplicate_rows: List[int],
266
+ output_folder: str = OUTPUT_FOLDER
267
+ ) -> str:
268
+ """
269
+ Remove duplicate rows from a tabular data file.
270
+
271
+ Args:
272
+ file_path (str): Path to the input file
273
+ duplicate_rows (List[int]): List of row indices to remove
274
+ output_folder (str): Output folder for cleaned file
275
+
276
+ Returns:
277
+ str: Path to the cleaned file
278
+ """
279
+ try:
280
+ # Load the file
281
+ df = read_file(file_path)
282
+
283
+ # Remove duplicate rows (0-indexed)
284
+ df_cleaned = df.drop(index=duplicate_rows).reset_index(drop=True)
285
+
286
+ # Save cleaned file
287
+ file_name = os.path.basename(file_path)
288
+ file_stem = os.path.splitext(file_name)[0]
289
+ file_ext = os.path.splitext(file_name)[1]
290
+
291
+ output_path = os.path.join(output_folder, f"{file_stem}_deduplicated{file_ext}")
292
+
293
+ if file_ext in ['.xlsx', '.xls']:
294
+ df_cleaned.to_excel(output_path, index=False)
295
+ elif file_ext in ['.parquet']:
296
+ df_cleaned.to_parquet(output_path, index=False)
297
+ else:
298
+ df_cleaned.to_csv(output_path, index=False, encoding="utf-8-sig")
299
+
300
+ return output_path
301
+
302
+ except Exception as e:
303
+ print(f"Error removing duplicates from {file_path}: {e}")
304
+ raise
305
+
306
+ def run_tabular_duplicate_analysis(
307
+ files: List[str],
308
+ threshold: float,
309
+ min_words: int,
310
+ text_columns: List[str] = None,
311
+ output_folder: str = OUTPUT_FOLDER,
312
+ do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
313
+ progress: Progress = Progress(track_tqdm=True)
314
+ ) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
315
+ """
316
+ Main function to run tabular duplicate analysis.
317
+
318
+ Args:
319
+ files (List[str]): List of file paths
320
+ threshold (float): Similarity threshold
321
+ min_words (int): Minimum word count
322
+ text_columns (List[str], optional): Specific columns to analyze
323
+ output_folder (str, optional): Output folder for results
324
+ progress (Progress): Progress tracking
325
+
326
+ Returns:
327
+ Tuple containing results DataFrame, output paths, and full data by file
328
+ """
329
+ return find_duplicate_cells_in_tabular_data(
330
+ input_files=files,
331
+ similarity_threshold=threshold,
332
+ min_word_count=min_words,
333
+ text_columns=text_columns,
334
+ output_folder=output_folder,
335
+ do_initial_clean_dup=do_initial_clean_dup,
336
+ progress=progress
337
+ )
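Similarly, a sketch of calling this top-level wrapper directly, for example from a script or the CLI (file names and parameter values are illustrative):

from tools.find_duplicate_tabular import run_tabular_duplicate_analysis

results_df, output_paths, full_data = run_tabular_duplicate_analysis(
    files=["input/file_a.csv", "input/file_b.xlsx"],  # hypothetical inputs
    threshold=0.95,     # similarity threshold for flagging duplicates
    min_words=3,        # minimum word count per cell
    text_columns=None,  # None means all text-like columns are considered
    output_folder="output/",
)
print(results_df.head())
print(output_paths)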
338
+
339
+
340
+
341
+ # Function to update column choices when files are uploaded
342
+ def update_tabular_column_choices(files):
343
+ if not files:
344
+ return gr.update(choices=[])
345
+
346
+ all_columns = set()
347
+ for file in files:
348
+ try:
349
+ df = read_file(file.name)
350
+
351
+ # Get text columns
352
+ text_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
353
+ all_columns.update(text_cols)
354
+ except Exception as e:
355
+ print(f"Error reading {file.name}: {e}")
356
+ continue
357
+
358
+ return gr.Dropdown(choices=sorted(list(all_columns)))
359
+
360
+ # Function to handle tabular duplicate detection
361
+ def run_tabular_duplicate_detection(files, threshold, min_words, text_columns, output_folder: str = OUTPUT_FOLDER, do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN):
362
+ if not files:
363
+ return pd.DataFrame(), [], gr.Dropdown(choices=[])
364
+
365
+ file_paths = [f.name for f in files]
366
+ results_df, output_paths, full_data = run_tabular_duplicate_analysis(
367
+ files=file_paths,
368
+ threshold=threshold,
369
+ min_words=min_words,
370
+ text_columns=text_columns if text_columns else None,
371
+ output_folder=output_folder,
372
+ do_initial_clean_dup=do_initial_clean_dup
373
+ )
374
+
375
+ print("output_paths:", output_paths)
376
+
377
+ # Update file choices for cleaning
378
+ file_choices = list(set(file_paths))
379
+
380
+ return results_df, output_paths, gr.Dropdown(choices=file_choices)
381
+
382
+ # Function to handle row selection for preview
383
+ def handle_tabular_row_selection(results_df, evt:gr.SelectData):
384
+
385
+ if not evt:
386
+ return None, "", ""
387
+
388
+ if not isinstance(results_df, pd.DataFrame):
389
+ return None, "", ""
390
+ elif results_df.empty:
391
+ return None, "", ""
392
+
393
+ selected_index = evt.index[0]
394
+ if selected_index >= len(results_df):
395
+ return None, "", ""
396
+
397
+ row = results_df.iloc[selected_index]
398
+ return selected_index, row['Text1'], row['Text2']
399
+
400
+ # Function to clean duplicates from selected file
401
+ def clean_tabular_duplicates(file_name, results_df, output_folder):
402
+ if not file_name or results_df.empty:
403
+ return None
404
+
405
+ # Get duplicate rows for this file
406
+ file_duplicates = results_df[results_df['File1'] == file_name]['Row1'].tolist()
407
+
408
+ if not file_duplicates:
409
+ return None
410
+
411
+ try:
412
+ # Use file_name directly as the path to the original file (a simplified approach;
413
+ # in practice you might want to store and look up the full file paths)
414
+ cleaned_file = remove_duplicate_rows_from_tabular_data(
415
+ file_path=file_name,
416
+ duplicate_rows=file_duplicates,
417
+ output_folder=output_folder
418
+ )
419
+ return cleaned_file
420
+ except Exception as e:
421
+ print(f"Error cleaning duplicates: {e}")
422
+ return None
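These helpers are written as Gradio callbacks. Below is a minimal, hypothetical sketch of how they could be wired into a Blocks UI, assuming they are exposed from tools.find_duplicate_tabular; the real app.py layout differs, and component names and default values here are purely illustrative:

import gradio as gr
from tools.find_duplicate_tabular import (
    update_tabular_column_choices,
    run_tabular_duplicate_detection,
    handle_tabular_row_selection,
)

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple", label="Tabular files")
    text_columns = gr.Dropdown(multiselect=True, label="Columns to check")
    threshold = gr.Slider(0.5, 1.0, value=0.95, label="Similarity threshold")
    min_words = gr.Number(value=3, label="Minimum words per cell")
    find_btn = gr.Button("Find duplicate cells")
    results = gr.Dataframe(label="Duplicate candidates")
    out_files = gr.File(file_count="multiple", label="Output files")
    selected_row = gr.Number(visible=False)
    text1 = gr.Textbox(label="Text 1")
    text2 = gr.Textbox(label="Text 2")
    file_to_clean = gr.Dropdown(label="File to deduplicate")

    # Refresh the column choices whenever new files are uploaded
    in_files.upload(update_tabular_column_choices, inputs=in_files, outputs=text_columns)
    # Run the duplicate search and populate the results table, output list, and file dropdown
    find_btn.click(
        run_tabular_duplicate_detection,
        inputs=[in_files, threshold, min_words, text_columns],
        outputs=[results, out_files, file_to_clean],
    )
    # Preview the two matched text cells when a result row is selected
    results.select(handle_tabular_row_selection, inputs=results, outputs=[selected_row, text1, text2])

demo.launch()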
tools/helper_functions.py CHANGED
@@ -132,26 +132,20 @@ def get_file_name_without_type(file_path):
132
 
133
  def detect_file_type(filename:str):
134
  """Detect the file type based on its extension."""
135
- if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
136
- return 'csv'
137
- elif filename.endswith('.xlsx'):
138
- return 'xlsx'
139
- elif filename.endswith('.parquet'):
140
- return 'parquet'
141
- elif filename.endswith('.pdf'):
142
- return 'pdf'
143
- elif filename.endswith('.jpg'):
144
- return 'jpg'
145
- elif filename.endswith('.jpeg'):
146
- return 'jpeg'
147
- elif filename.endswith('.png'):
148
- return 'png'
149
- elif filename.endswith('.xfdf'):
150
- return 'xfdf'
151
- elif filename.endswith('.docx'):
152
- return 'docx'
153
- else:
154
- raise ValueError("Unsupported file type.")
155
 
156
  def read_file(filename:str):
157
  """Read the file based on its detected type."""
 
132
 
133
  def detect_file_type(filename:str):
134
  """Detect the file type based on its extension."""
135
+ if not isinstance(filename, str):
136
+ filename = str(filename)
137
+
138
+ if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')): return 'csv'
139
+ elif filename.endswith('.xlsx'): return 'xlsx'
140
+ elif filename.endswith('.xls'): return 'xls'
141
+ elif filename.endswith('.parquet'): return 'parquet'
142
+ elif filename.endswith('.pdf'): return 'pdf'
143
+ elif filename.endswith('.jpg'): return 'jpg'
144
+ elif filename.endswith('.jpeg'): return 'jpeg'
145
+ elif filename.endswith('.png'): return 'png'
146
+ elif filename.endswith('.xfdf'): return 'xfdf'
147
+ elif filename.endswith('.docx'): return 'docx'
148
+ else: raise ValueError("Unsupported file type.")
 
 
 
 
 
 
149
 
150
  def read_file(filename:str):
151
  """Read the file based on its detected type."""