Spaces:

seanpedrickcase
/

document_redaction

Running

document_redaction / lambda_entrypoint.py

Added form, table, and layout extraction options to AWS Textract calls. Added options to config to bound document length, maximum table rows, etc.

d3e6a24 about 2 months ago

raw

history blame

10 kB

	import boto3
	import os
	import json

	# Import the main function from your CLI script
	from cli_redact import main as cli_main

	print("Lambda entrypoint loading...")

	# Initialize S3 client outside the handler for connection reuse
	s3_client = boto3.client("s3", region_name=os.getenv("AWS_REGION", "eu-west-2"))
	print("S3 client initialised")

	# Lambda's only writable directory
	TMP_DIR = "/tmp"
	INPUT_DIR = os.path.join(TMP_DIR, "input")
	OUTPUT_DIR = os.path.join(TMP_DIR, "output")

	def download_file_from_s3(bucket_name, key, download_path):
	"""Download a file from S3 to the local filesystem."""
	try:
	s3_client.download_file(bucket_name, key, download_path)
	print(f"Successfully downloaded s3://{bucket_name}/{key} to {download_path}")
	except Exception as e:
	print(f"Error downloading from S3: {e}")
	raise

	def upload_directory_to_s3(local_directory, bucket_name, s3_prefix):
	"""Upload all files from a local directory to an S3 prefix."""
	for root, _, files in os.walk(local_directory):
	for file_name in files:
	local_file_path = os.path.join(root, file_name)
	# Create a relative path to maintain directory structure if needed
	relative_path = os.path.relpath(local_file_path, local_directory)
	output_key = os.path.join(s3_prefix, relative_path)

	try:
	s3_client.upload_file(local_file_path, bucket_name, output_key)
	print(f"Successfully uploaded {local_file_path} to s3://{bucket_name}/{output_key}")
	except Exception as e:
	print(f"Error uploading to S3: {e}")
	raise

	def lambda_handler(event, context):
	print(f"Received event: {json.dumps(event)}")

	# 1. Setup temporary directories
	os.makedirs(INPUT_DIR, exist_ok=True)
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	# 2. Extract information from the event
	# Assumes the event is triggered by S3 and may contain an 'arguments' payload
	try:
	record = event['Records'][0]
	bucket_name = record['s3']['bucket']['name']
	input_key = record['s3']['object']['key']

	# The user metadata can be used to pass arguments
	# This is more robust than embedding them in the main event body
	response = s3_client.head_object(Bucket=bucket_name, Key=input_key)
	metadata = response.get('Metadata', {})
	# Arguments can be passed as a JSON string in metadata
	arguments = json.loads(metadata.get('arguments', '{}'))

	except (KeyError, IndexError) as e:
	print(f"Could not parse S3 event record: {e}. Checking for direct invocation payload.")
	# Fallback for direct invocation (e.g., from Step Functions or manual test)
	bucket_name = event.get('bucket_name')
	input_key = event.get('input_key')
	arguments = event.get('arguments', {})
	if not all([bucket_name, input_key]):
	raise ValueError("Missing 'bucket_name' or 'input_key' in direct invocation event.")

	print(f"Processing s3://{bucket_name}/{input_key}")
	print(f"With arguments: {arguments}")

	# 3. Download the main input file
	input_file_path = os.path.join(INPUT_DIR, os.path.basename(input_key))
	download_file_from_s3(bucket_name, input_key, input_file_path)

	# 4. Prepare arguments for the CLI function
	# This dictionary should mirror the one in your app.py's "direct mode"
	cli_args = {
	'task': arguments.get('task', 'redact'),
	'input_file': input_file_path,
	'output_dir': OUTPUT_DIR,
	'input_dir': INPUT_DIR,
	'language': arguments.get('language', 'en_core_web_lg'),
	'pii_detector': arguments.get('pii_detector', 'Local'), # Default to local
	'username': arguments.get('username', 'lambda_user'),
	'save_to_user_folders': arguments.get('save_to_user_folders', 'False'),
	'ocr_method': arguments.get('ocr_method', 'Tesseract OCR - all PDF types'),
	'page_min': int(arguments.get('page_min', 0)),
	'page_max': int(arguments.get('page_max', 0)),
	'handwrite_signature_extraction': arguments.get('handwrite_signature_checkbox', ['Extract handwriting', 'Extract signatures']),
	'extract_forms': arguments.get('extract_forms', False),
	'extract_tables': arguments.get('extract_tables', False),
	'extract_layout': arguments.get('extract_layout', False),

	# General arguments
	'local_redact_entities': arguments.get('local_redact_entities', []),
	'aws_redact_entities': arguments.get('aws_redact_entities', []),
	'cost_code': arguments.get('cost_code', ''),
	'save_logs_to_csv': arguments.get('save_logs_to_csv', 'False'),
	'save_logs_to_dynamodb': arguments.get('save_logs_to_dynamodb', 'False'),
	'display_file_names_in_logs': arguments.get('display_file_names_in_logs', 'True'),
	'upload_logs_to_s3': arguments.get('upload_logs_to_s3', 'False'),
	's3_logs_prefix': arguments.get('s3_logs_prefix', ''),
	'do_initial_clean': arguments.get('do_initial_clean', 'False'),

	# PDF/Image specific arguments
	'images_dpi': float(arguments.get('images_dpi', 300.0)),
	'chosen_local_ocr_model': arguments.get('chosen_local_ocr_model', 'tesseract'),
	'preprocess_local_ocr_images': arguments.get('preprocess_local_ocr_images', 'False'),

	# Handle optional files like allow/deny lists
	'allow_list_file': arguments.get('allow_list_file', ""),
	'deny_list_file': arguments.get('deny_list_file', ""),
	'redact_whole_page_file': arguments.get('redact_whole_page_file', ""),

	# Tabular/Anonymisation arguments
	'excel_sheets': arguments.get('excel_sheets', []),
	'fuzzy_mistakes': int(arguments.get('fuzzy_mistakes', 0)),
	'match_fuzzy_whole_phrase_bool': arguments.get('match_fuzzy_whole_phrase_bool', 'True'),

	# Deduplication specific arguments
	'duplicate_type': arguments.get('duplicate_type', 'pages'),
	'similarity_threshold': float(arguments.get('similarity_threshold', 0.95)),
	'min_word_count': int(arguments.get('min_word_count', 3)),
	'min_consecutive_pages': int(arguments.get('min_consecutive_pages', 1)),
	'greedy_match': arguments.get('greedy_match', 'False'),
	'combine_pages': arguments.get('combine_pages', 'True'),
	'search_query': arguments.get('search_query', ""),
	'text_columns': arguments.get('text_columns', []),
	'remove_duplicate_rows': arguments.get('remove_duplicate_rows', 'True'),
	'anon_strategy': arguments.get('anon_strategy', 'redact'),

	# Textract specific arguments
	'textract_action': arguments.get('textract_action', ''),
	'job_id': arguments.get('job_id', ''),
	'extract_signatures': arguments.get('extract_signatures', False),
	'textract_bucket': arguments.get('textract_bucket', ''),
	'textract_input_prefix': arguments.get('textract_input_prefix', ''),
	'textract_output_prefix': arguments.get('textract_output_prefix', ''),
	's3_textract_document_logs_subfolder': arguments.get('s3_textract_document_logs_subfolder', ''),
	'local_textract_document_logs_subfolder': arguments.get('local_textract_document_logs_subfolder', ''),
	'poll_interval': int(arguments.get('poll_interval', 30)),
	'max_poll_attempts': int(arguments.get('max_poll_attempts', 120)),

	# AWS credentials (use IAM Role instead of keys)
	'aws_access_key': None,
	'aws_secret_key': None,
	'aws_region': os.getenv("AWS_REGION", ""),
	's3_bucket': bucket_name,

	# Set defaults for boolean flags
	'prepare_images': arguments.get('prepare_images', True),
	'compress_redacted_pdf': arguments.get('compress_redacted_pdf', False),
	'return_pdf_end_of_redaction': arguments.get('return_pdf_end_of_redaction', True)
	}

	# Combine extraction options
	extraction_options = list(cli_args['handwrite_signature_extraction']) if cli_args['handwrite_signature_extraction'] else []
	if cli_args['extract_forms']:
	extraction_options.append('Extract forms')
	if cli_args['extract_tables']:
	extraction_options.append('Extract tables')
	if cli_args['extract_layout']:
	extraction_options.append('Extract layout')
	cli_args['handwrite_signature_extraction'] = extraction_options

	# Download optional files if they are specified
	allow_list_key = arguments.get('allow_list_file')
	if allow_list_key:
	allow_list_path = os.path.join(INPUT_DIR, 'allow_list.csv')
	download_file_from_s3(bucket_name, allow_list_key, allow_list_path)
	cli_args['allow_list_file'] = allow_list_path

	deny_list_key = arguments.get('deny_list_file')
	if deny_list_key:
	deny_list_path = os.path.join(INPUT_DIR, 'deny_list.csv')
	download_file_from_s3(bucket_name, deny_list_key, deny_list_path)
	cli_args['deny_list_file'] = deny_list_path

	# 5. Execute the main application logic
	try:
	print("--- Starting CLI Redact Main Function ---")
	print(f"Arguments passed to cli_main: {cli_args}")
	cli_main(direct_mode_args=cli_args)
	print("--- CLI Redact Main Function Finished ---")
	except Exception as e:
	print(f"An error occurred during CLI execution: {e}")
	# Optionally, re-raise the exception to make the Lambda fail
	raise

	# 6. Upload results back to S3
	output_s3_prefix = f"output/{os.path.splitext(os.path.basename(input_key))[0]}"
	print(f"Uploading contents of {OUTPUT_DIR} to s3://{bucket_name}/{output_s3_prefix}/")
	upload_directory_to_s3(OUTPUT_DIR, bucket_name, output_s3_prefix)

	return {
	"statusCode": 200,
	"body": json.dumps(f"Processing complete for {input_key}. Output saved to s3://{bucket_name}/{output_s3_prefix}/")
	}