Fixed deprecated GitHub workflow functions. Applied the linter and formatter throughout the codebase. Added tests for GUI load.
bafcf39
```python
import json
import os

import boto3

# Import the main function from your CLI script
from cli_redact import main as cli_main

print("Lambda entrypoint loading...")

# Initialize S3 client outside the handler for connection reuse
s3_client = boto3.client("s3", region_name=os.getenv("AWS_REGION", "eu-west-2"))
print("S3 client initialised")

# Lambda's only writable directory
TMP_DIR = "/tmp"
INPUT_DIR = os.path.join(TMP_DIR, "input")
OUTPUT_DIR = os.path.join(TMP_DIR, "output")

def download_file_from_s3(bucket_name, key, download_path):
    """Download a file from S3 to the local filesystem."""
    try:
        s3_client.download_file(bucket_name, key, download_path)
        print(f"Successfully downloaded s3://{bucket_name}/{key} to {download_path}")
    except Exception as e:
        print(f"Error downloading from S3: {e}")
        raise

def upload_directory_to_s3(local_directory, bucket_name, s3_prefix):
    """Upload all files from a local directory to an S3 prefix."""
    for root, _, files in os.walk(local_directory):
        for file_name in files:
            local_file_path = os.path.join(root, file_name)
            # Create a relative path to maintain directory structure if needed
            relative_path = os.path.relpath(local_file_path, local_directory)
            output_key = os.path.join(s3_prefix, relative_path)
            try:
                s3_client.upload_file(local_file_path, bucket_name, output_key)
                print(
                    f"Successfully uploaded {local_file_path} to s3://{bucket_name}/{output_key}"
                )
            except Exception as e:
                print(f"Error uploading to S3: {e}")
                raise

def lambda_handler(event, context):
    print(f"Received event: {json.dumps(event)}")

    # 1. Set up temporary directories
    os.makedirs(INPUT_DIR, exist_ok=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 2. Extract information from the event.
    # Assumes the event is triggered by S3 and may contain an 'arguments' payload.
    try:
        record = event["Records"][0]
        bucket_name = record["s3"]["bucket"]["name"]
        input_key = record["s3"]["object"]["key"]
        # User metadata can be used to pass arguments; this is more robust
        # than embedding them in the main event body.
        response = s3_client.head_object(Bucket=bucket_name, Key=input_key)
        metadata = response.get("Metadata", {})
        # Arguments can be passed as a JSON string in metadata
        arguments = json.loads(metadata.get("arguments", "{}"))
    except (KeyError, IndexError) as e:
        print(
            f"Could not parse S3 event record: {e}. Checking for direct invocation payload."
        )
        # Fallback for direct invocation (e.g., from Step Functions or a manual test)
        bucket_name = event.get("bucket_name")
        input_key = event.get("input_key")
        arguments = event.get("arguments", {})
        if not all([bucket_name, input_key]):
            raise ValueError(
                "Missing 'bucket_name' or 'input_key' in direct invocation event."
            )

    print(f"Processing s3://{bucket_name}/{input_key}")
    print(f"With arguments: {arguments}")

    # 3. Download the main input file
    input_file_path = os.path.join(INPUT_DIR, os.path.basename(input_key))
    download_file_from_s3(bucket_name, input_key, input_file_path)

    # 4. Prepare arguments for the CLI function.
    # This dictionary should mirror the one in your app.py's "direct mode".
    cli_args = {
        "task": arguments.get("task", "redact"),
        "input_file": input_file_path,
        "output_dir": OUTPUT_DIR,
        "input_dir": INPUT_DIR,
        "language": arguments.get("language", "en_core_web_lg"),
        "pii_detector": arguments.get("pii_detector", "Local"),  # Default to local
        "username": arguments.get("username", "lambda_user"),
        "save_to_user_folders": arguments.get("save_to_user_folders", "False"),
        "ocr_method": arguments.get("ocr_method", "Tesseract OCR - all PDF types"),
        "page_min": int(arguments.get("page_min", 0)),
        "page_max": int(arguments.get("page_max", 0)),
        "handwrite_signature_extraction": arguments.get(
            "handwrite_signature_checkbox",
            ["Extract handwriting", "Extract signatures"],
        ),
        "extract_forms": arguments.get("extract_forms", False),
        "extract_tables": arguments.get("extract_tables", False),
        "extract_layout": arguments.get("extract_layout", False),
        # General arguments
        "local_redact_entities": arguments.get("local_redact_entities", []),
        "aws_redact_entities": arguments.get("aws_redact_entities", []),
        "cost_code": arguments.get("cost_code", ""),
        "save_logs_to_csv": arguments.get("save_logs_to_csv", "False"),
        "save_logs_to_dynamodb": arguments.get("save_logs_to_dynamodb", "False"),
        "display_file_names_in_logs": arguments.get(
            "display_file_names_in_logs", "True"
        ),
        "upload_logs_to_s3": arguments.get("upload_logs_to_s3", "False"),
        "s3_logs_prefix": arguments.get("s3_logs_prefix", ""),
        "do_initial_clean": arguments.get("do_initial_clean", "False"),
        # PDF/Image specific arguments
        "images_dpi": float(arguments.get("images_dpi", 300.0)),
        "chosen_local_ocr_model": arguments.get("chosen_local_ocr_model", "tesseract"),
        "preprocess_local_ocr_images": arguments.get(
            "preprocess_local_ocr_images", "False"
        ),
        # Handle optional files like allow/deny lists
        "allow_list_file": arguments.get("allow_list_file", ""),
        "deny_list_file": arguments.get("deny_list_file", ""),
        "redact_whole_page_file": arguments.get("redact_whole_page_file", ""),
        # Tabular/Anonymisation arguments
        "excel_sheets": arguments.get("excel_sheets", []),
        "fuzzy_mistakes": int(arguments.get("fuzzy_mistakes", 0)),
        "match_fuzzy_whole_phrase_bool": arguments.get(
            "match_fuzzy_whole_phrase_bool", "True"
        ),
        # Deduplication specific arguments
        "duplicate_type": arguments.get("duplicate_type", "pages"),
        "similarity_threshold": float(arguments.get("similarity_threshold", 0.95)),
        "min_word_count": int(arguments.get("min_word_count", 3)),
        "min_consecutive_pages": int(arguments.get("min_consecutive_pages", 1)),
        "greedy_match": arguments.get("greedy_match", "False"),
        "combine_pages": arguments.get("combine_pages", "True"),
        "search_query": arguments.get("search_query", ""),
        "text_columns": arguments.get("text_columns", []),
        "remove_duplicate_rows": arguments.get("remove_duplicate_rows", "True"),
        "anon_strategy": arguments.get("anon_strategy", "redact"),
        # Textract specific arguments
        "textract_action": arguments.get("textract_action", ""),
        "job_id": arguments.get("job_id", ""),
        "extract_signatures": arguments.get("extract_signatures", False),
        "textract_bucket": arguments.get("textract_bucket", ""),
        "textract_input_prefix": arguments.get("textract_input_prefix", ""),
        "textract_output_prefix": arguments.get("textract_output_prefix", ""),
        "s3_textract_document_logs_subfolder": arguments.get(
            "s3_textract_document_logs_subfolder", ""
        ),
        "local_textract_document_logs_subfolder": arguments.get(
            "local_textract_document_logs_subfolder", ""
        ),
        "poll_interval": int(arguments.get("poll_interval", 30)),
        "max_poll_attempts": int(arguments.get("max_poll_attempts", 120)),
        # AWS credentials (use the Lambda's IAM role instead of keys)
        "aws_access_key": None,
        "aws_secret_key": None,
        "aws_region": os.getenv("AWS_REGION", ""),
        "s3_bucket": bucket_name,
        # Set defaults for boolean flags
        "prepare_images": arguments.get("prepare_images", True),
        "compress_redacted_pdf": arguments.get("compress_redacted_pdf", False),
        "return_pdf_end_of_redaction": arguments.get(
            "return_pdf_end_of_redaction", True
        ),
    }

    # Combine extraction options
    extraction_options = (
        list(cli_args["handwrite_signature_extraction"])
        if cli_args["handwrite_signature_extraction"]
        else []
    )
    if cli_args["extract_forms"]:
        extraction_options.append("Extract forms")
    if cli_args["extract_tables"]:
        extraction_options.append("Extract tables")
    if cli_args["extract_layout"]:
        extraction_options.append("Extract layout")
    cli_args["handwrite_signature_extraction"] = extraction_options

    # Download optional files if they are specified
    allow_list_key = arguments.get("allow_list_file")
    if allow_list_key:
        allow_list_path = os.path.join(INPUT_DIR, "allow_list.csv")
        download_file_from_s3(bucket_name, allow_list_key, allow_list_path)
        cli_args["allow_list_file"] = allow_list_path

    deny_list_key = arguments.get("deny_list_file")
    if deny_list_key:
        deny_list_path = os.path.join(INPUT_DIR, "deny_list.csv")
        download_file_from_s3(bucket_name, deny_list_key, deny_list_path)
        cli_args["deny_list_file"] = deny_list_path

    # 5. Execute the main application logic
    try:
        print("--- Starting CLI Redact Main Function ---")
        print(f"Arguments passed to cli_main: {cli_args}")
        cli_main(direct_mode_args=cli_args)
        print("--- CLI Redact Main Function Finished ---")
    except Exception as e:
        print(f"An error occurred during CLI execution: {e}")
        # Re-raise the exception so the Lambda invocation is marked as failed
        raise

    # 6. Upload results back to S3
    output_s3_prefix = f"output/{os.path.splitext(os.path.basename(input_key))[0]}"
    print(
        f"Uploading contents of {OUTPUT_DIR} to s3://{bucket_name}/{output_s3_prefix}/"
    )
    upload_directory_to_s3(OUTPUT_DIR, bucket_name, output_s3_prefix)

    return {
        "statusCode": 200,
        "body": json.dumps(
            f"Processing complete for {input_key}. Output saved to s3://{bucket_name}/{output_s3_prefix}/"
        ),
    }
```
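
For reference, the handler reads optional arguments from the uploaded object's S3 user metadata, as a JSON string under the `arguments` key. Below is a minimal sketch of how a caller might attach them at upload time; the bucket and key names are illustrative assumptions, not part of this commit.

```python
import json

import boto3

s3 = boto3.client("s3")

# Hypothetical bucket/key. The 'arguments' metadata value must be a JSON
# string, since lambda_handler parses it with json.loads().
s3.upload_file(
    "report.pdf",
    "my-redaction-bucket",
    "input/report.pdf",
    ExtraArgs={
        "Metadata": {
            "arguments": json.dumps({"task": "redact", "pii_detector": "Local"})
        }
    },
)
```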
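
The handler's fallback branch also accepts a direct invocation payload (e.g. from Step Functions or a manual test). A sketch of such an invocation, assuming a hypothetical function name:

```python
import json

import boto3

lambda_client = boto3.client("lambda")

# 'doc-redaction-lambda' is a placeholder name. The payload shape matches the
# handler's fallback: top-level 'bucket_name', 'input_key', and an optional
# 'arguments' dict.
response = lambda_client.invoke(
    FunctionName="doc-redaction-lambda",
    Payload=json.dumps(
        {
            "bucket_name": "my-redaction-bucket",
            "input_key": "input/report.pdf",
            "arguments": {"task": "redact"},
        }
    ),
)
print(response["Payload"].read())
```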