Spaces:

seanpedrickcase
/

document_redaction

Sleeping

App Files Files Community

document_redaction / tools /file_redaction.py

seanpedrickcase

Added logging, anonymising all Excel sheets, simple redaction tags, some Dockerfile optimisation

01c88c0 about 1 year ago

raw

history blame

18 kB

	from PIL import Image
	from typing import List
	import pandas as pd
	from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
	from presidio_image_redactor.entities import ImageRecognizerResult
	from pdfminer.high_level import extract_pages
	from tools.file_conversion import process_file
	from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
	from pikepdf import Pdf, Dictionary, Name
	from gradio import Progress
	import time
	from collections import defaultdict # For efficient grouping

	from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
	from tools.helper_functions import get_file_path_end, output_folder
	from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf
	import gradio as gr


	def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=None, out_file_paths:list=None, progress=gr.Progress(track_tqdm=True)):
	    '''
	    Redact the next unprocessed file in file_paths and report progress back to the caller.

	    Only one file is handled per call: the one at index latest_file_completed. The
	    counter is returned incremented so the caller can invoke the function again for
	    the following file.

	    Parameters:
	    - file_paths: uploaded file objects; each must expose a .name attribute holding its path.
	    - image_paths: passed through to redact_image_pdf; if empty, that function converts the file itself.
	    - language: language code handed to the analysers.
	    - chosen_redact_entities: entity types to redact.
	    - in_redact_method: "Image analysis" or "Text analysis". Forced to "Image analysis" when the file is not a PDF.
	    - in_allow_list: list of rows (lists of terms) that must not be redacted; flattened before use.
	    - latest_file_completed: number of files already processed (anything int() accepts).
	    - out_message: messages accumulated so far, as a list, a single string, or None.
	    - out_file_paths: output paths accumulated so far, or None/empty.
	    - progress: Gradio progress tracker used to wrap the file loop.

	    Returns:
	    A 4-tuple (message, out_file_paths, out_file_paths, latest_file_completed). The
	    output path list is deliberately returned twice, matching the original contract.
	    '''

	    tic = time.perf_counter()

	    # Defaults are None rather than [] so that messages appended during one call can
	    # never leak into a later call through a shared default list.
	    if out_message is None:
	        out_message = []
	    elif isinstance(out_message, str):
	        out_message = [out_message]

	    if not out_file_paths:
	        out_file_paths = []

	    if file_paths is None:
	        file_paths = []

	    print("Latest file completed is:", str(latest_file_completed))

	    latest_file_completed = int(latest_file_completed)

	    # If we have already redacted the last file, return the input out_message and file
	    # list to the relevant components. '>=' also guards against an out-of-range counter,
	    # which would otherwise raise IndexError below.
	    if latest_file_completed >= len(file_paths):
	        print("Last file reached, returning files:", str(latest_file_completed))
	        final_out_message = '\n'.join(out_message)
	        return final_out_message, out_file_paths, out_file_paths, latest_file_completed

	    file_paths_loop = [file_paths[latest_file_completed]]

	    # Always bind the flattened allow list; it was previously undefined (NameError)
	    # whenever no allow list was supplied.
	    if in_allow_list:
	        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
	    else:
	        in_allow_list_flat = []

	    print("File paths:", file_paths)

	    for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
	        file_path = file.name

	        if not file_path:
	            out_message = "No file selected"
	            print(out_message)
	            return out_message, out_file_paths, out_file_paths, latest_file_completed

	        file_path_without_ext = get_file_path_end(file_path)
	        if not is_pdf(file_path):
	            # If user has not submitted a pdf, assume it's an image
	            print("File is not a pdf, assuming that image analysis needs to be used.")
	            in_redact_method = "Image analysis"

	        if in_redact_method == "Image analysis":
	            # Analyse and redact image-based pdf or image
	            print("Redacting file as image-based pdf")
	            pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
	            out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
	            pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])

	            out_file_paths.append(out_image_file_path)
	            out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file.")

	            # Increase latest file completed count unless we are at the last file
	            if latest_file_completed != len(file_paths):
	                print("Completed file number:", str(latest_file_completed))
	                latest_file_completed += 1

	        elif in_redact_method == "Text analysis":
	            # Defensive only: non-PDF input has already been switched to image analysis
	            # above. Returns the same 4-tuple shape as every other exit path.
	            if not is_pdf(file_path):
	                not_pdf_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
	                return not_pdf_message, out_file_paths, out_file_paths, latest_file_completed

	            # Analyse text-based pdf
	            print('Redacting file as text-based PDF')
	            pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
	            out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
	            pdf_text.save(out_text_file_path)

	            out_message_new = "File " + file_path_without_ext + " successfully redacted."
	            out_message.append(out_message_new)

	            convert_message = "Converting PDF to image-based PDF to embed redactions."
	            print(convert_message)

	            # Convert document to image-based document to 'embed' redactions. Only the
	            # converted file is reported as output, not the intermediate annotated PDF.
	            img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
	            out_file_paths.extend(img_output_file_path)

	            if latest_file_completed != len(file_paths):
	                print("Completed file number:", str(latest_file_completed))
	                latest_file_completed += 1

	        else:
	            out_message = "No redaction method selected"
	            print(out_message)
	            return out_message, out_file_paths, out_file_paths, latest_file_completed

	    toc = time.perf_counter()
	    out_time = f"in {toc - tic:0.1f} seconds."
	    print(out_time)

	    out_message_out = '\n'.join(out_message)
	    out_message_out = out_message_out + " " + out_time

	    return out_message_out, out_file_paths, out_file_paths, latest_file_completed

	def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
	    '''
	    Combine detected boxes that sit on roughly the same row and close together.

	    Boxes are bucketed by round(top / vertical_threshold). Within a bucket they are
	    walked left to right, and a box whose left edge is at most horizontal_threshold
	    beyond the right edge of the running box is folded into it. The folded box keeps
	    the entity type, start, end and score of the running box and takes the union of
	    both rectangles.

	    Returns a list of boxes, bucket by bucket in order of first appearance.
	    '''
	    rows = defaultdict(list)

	    # 1. Bucket boxes by approximate vertical position
	    for bbox in bboxes:
	        row_key = round(bbox.top / vertical_threshold)
	        rows[row_key].append(bbox)

	    result = []

	    # 2. Sweep each bucket from left to right, folding neighbours together
	    for row in rows.values():
	        ordered = sorted(row, key=lambda item: item.left)

	        current = ordered[0]
	        for candidate in ordered[1:]:
	            gap = candidate.left - (current.left + current.width)
	            if gap <= horizontal_threshold:
	                print("Merging a box")
	                # Union of the two rectangles
	                left = min(current.left, candidate.left)
	                top = min(current.top, candidate.top)
	                right = max(current.left + current.width, candidate.left + candidate.width)
	                bottom = max(current.top + current.height, candidate.top + candidate.height)
	                current = ImageRecognizerResult(
	                    current.entity_type, current.start, current.end, current.score,
	                    left, top, right - left, bottom - top
	                )
	            else:
	                result.append(current)
	                current = candidate

	        result.append(current)

	    return result

	def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
	    '''
	    Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.

	    Parameters:
	    - file_path: source document; only used (via process_file) when image_paths is empty.
	    - image_paths: one entry per page. Each entry is handed straight to
	      ImageChops.duplicate, so despite the name these are expected to be PIL images
	      rather than path strings - TODO confirm against process_file.
	    - language: analyser language; 'en' is mapped to the OCR language code 'eng',
	      any other value is passed to the OCR unchanged.
	    - chosen_redact_entities: entity types to look for.
	    - allow_list: terms that should not be redacted.
	    - progress: currently unused in the body.

	    Returns:
	    A list with one redacted copy per page; detected regions are painted solid black.
	    '''
	    from PIL import ImageChops, ImageDraw

	    fill = (0, 0, 0)

	    if not image_paths:
	        out_message = "PDF does not exist as images. Converting pages to image"
	        print(out_message)

	        image_paths = process_file(file_path)

	    out_message = "Redacting pages"
	    print(out_message)

	    # Everything below is identical for every page, so build it once rather than per
	    # page. (The ImageRedactorEngine previously created here was never used.)
	    image_analyser = ImageAnalyzerEngine(nlp_analyser)
	    ocr_lang = 'eng' if language == 'en' else language
	    analyser_kwargs = {
	        "allow_list": allow_list,
	        "language": language,
	        "entities": chosen_redact_entities,
	        "score_threshold": score_threshold
	    }

	    images = []

	    for i, page_image in enumerate(image_paths):

	        print("Redacting page ", str(i + 1))

	        # Work on a copy so the caller's page image is left untouched
	        image = ImageChops.duplicate(page_image)

	        bboxes = image_analyser.analyze(image, ocr_kwargs={"lang": ocr_lang}, **analyser_kwargs)

	        draw = ImageDraw.Draw(image)

	        # Join neighbouring detections so gaps between adjacent entities are covered too
	        merged_bboxes = merge_img_bboxes(bboxes)

	        print("For page: ", str(i), "Merged bounding boxes: ", merged_bboxes)

	        for box in merged_bboxes:
	            x0 = box.left
	            y0 = box.top
	            x1 = x0 + box.width
	            y1 = y0 + box.height
	            draw.rectangle([x0, y0, x1, y1], fill=fill)

	        images.append(image)

	    return images

	def _merge_char_boxes(analyzer_results, characters, combine_pixel_dist, vertical_tolerance=5):
	    '''
	    Collapse the per-character boxes of detected entities into wider redaction boxes.

	    Characters are visited result by result. A character joins the running box when
	    its lower edge is within vertical_tolerance of the box's starting lower edge and
	    its left edge is within combine_pixel_dist of the box's right edge; otherwise the
	    running box is emitted and a new one is started. The running box is carried
	    across results on purpose, so that neighbouring entities end up under one box.

	    Returns a list of {"boundingBox": [x0, y0, x1, y1], "result": result} dicts, where
	    result is the analyser result that started the box.
	    '''
	    merged = []
	    current_box = None
	    current_y = None
	    current_result = None

	    for result in analyzer_results:
	        for char in characters[result.start : result.end]:
	            # Non-LTChar layout items are skipped; only LTChar.bbox is read here
	            if not isinstance(char, LTChar):
	                continue

	            char_box = list(char.bbox)

	            if current_box is None:
	                # First character seen: start the running box
	                current_box = char_box
	                current_y = char_box[1]
	                current_result = result
	                continue

	            print("Comparing values")
	            vertical_diff_bboxes = abs(char_box[1] - current_y)
	            horizontal_diff_bboxes = abs(char_box[0] - current_box[2])

	            if vertical_diff_bboxes <= vertical_tolerance and horizontal_diff_bboxes <= combine_pixel_dist:
	                old_right_pos = current_box[2]
	                # Grow to the union of both rectangles so the box never shrinks and
	                # also covers characters that sit slightly higher or lower.
	                current_box[0] = min(current_box[0], char_box[0])
	                current_box[1] = min(current_box[1], char_box[1])
	                current_box[2] = max(current_box[2], char_box[2])
	                current_box[3] = max(current_box[3], char_box[3])

	                print("Old right pos: ", str(old_right_pos), "has been replaced with: ", str(current_box[2]), "for result: ", result)
	            else:
	                # Label the finished box with the result that opened it, not with
	                # whichever result happens to be current when it is flushed.
	                merged.append({"boundingBox": current_box, "result": current_result})
	                current_box = char_box
	                current_y = char_box[1]
	                current_result = result

	    # Add the last box
	    if current_box is not None:
	        merged.append({"boundingBox": current_box, "result": current_result})

	    return merged

	def _make_redaction_annotation(bounding_box, entity_type):
	    '''Build a filled black square annotation covering bounding_box, titled with entity_type.'''
	    x0, y0, x1, y1 = bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]
	    return Dictionary(
	        Type=Name.Annot,
	        Subtype=Name.Square, #Name.Highlight,
	        QuadPoints=[x0, y1, x1, y1, x0, y0, x1, y0],
	        Rect=[x0, y0, x1, y1],
	        C=[0, 0, 0],
	        IC=[0, 0, 0],
	        CA=1, # Opacity: fully opaque
	        T=entity_type,
	        BS=Dictionary(
	            W=0, # Border width: none
	            S=Name.S # Border style: solid
	        )
	    )

	def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
	    '''
	    Redact chosen entities from a pdf that is made up of multiple pages that are not images.

	    Each page's text containers are run through nlp_analyser. The characters of every
	    hit are merged into boxes, and each box becomes a black square annotation. The
	    page's Annots entry is replaced with these annotations (an empty list if there were
	    no hits). A summary of all boxes is written to output_folder + "annotations_made.csv".

	    Parameters:
	    - filename: path of the PDF to open.
	    - language: language code for the analyser.
	    - chosen_redact_entities: entity types to look for.
	    - allow_list: terms that should not be redacted.
	    - progress: currently unused in the body.

	    Returns:
	    The opened pikepdf Pdf with annotations attached; saving it is left to the caller.
	    '''

	    summary_frames = []

	    # Horizontal distance between PII bounding boxes under/equal they are combined into one
	    combine_pixel_dist = 100

	    pdf = Pdf.open(filename)

	    for page_num, page in enumerate(pdf.pages):
	        print("Page number is: ", page_num + 1)

	        annotations_on_page = []
	        analyzed_bounding_boxes = []

	        # pdfminer re-reads the file to lay out just this one page
	        for page_layout in extract_pages(filename, page_numbers = [page_num], maxpages=1):
	            for text_container in page_layout:
	                if not isinstance(text_container, LTTextContainer):
	                    continue

	                text_to_analyze = text_container.get_text()

	                analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
	                                                        language=language,
	                                                        entities=chosen_redact_entities,
	                                                        score_threshold=score_threshold,
	                                                        return_decision_process=False,
	                                                        allow_list=allow_list)

	                # Flatten the container into its layout characters. Result offsets are
	                # used below as indexes into this list.
	                characters = [char
	                              for line in text_container
	                              if isinstance(line, LTTextLine)
	                              for char in line]

	                if analyzer_results and characters:
	                    analyzed_bounding_boxes.extend(
	                        _merge_char_boxes(analyzer_results, characters, combine_pixel_dist))

	        # Guard on the boxes actually collected for the page. Checking only the last
	        # container's results could skip a page that has hits, or index a frame that
	        # has no 'result' column.
	        if analyzed_bounding_boxes:
	            # Create summary df of annotations to be made
	            page_boxes_df = pd.DataFrame(analyzed_bounding_boxes)
	            # Split the result's string form into four comma-separated fields and strip
	            # everything up to ": " from each one
	            page_boxes_text = page_boxes_df['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
	            page_boxes_text.columns = ["type", "start", "end", "score"]
	            page_boxes_df = pd.concat([page_boxes_df, page_boxes_text], axis = 1)
	            page_boxes_df['page'] = page_num + 1
	            summary_frames.append(page_boxes_df)

	        for analyzed_bounding_box in analyzed_bounding_boxes:
	            annotations_on_page.append(
	                _make_redaction_annotation(analyzed_bounding_box["boundingBox"],
	                                           analyzed_bounding_box["result"].entity_type))

	        print("For page number: ", page_num, " there are ", len(annotations_on_page), " annotations")
	        page.Annots = pdf.make_indirect(annotations_on_page)

	    # Concatenate once at the end; an empty frame is still written when nothing was found
	    if summary_frames:
	        analyzed_bounding_boxes_df = pd.concat(summary_frames, axis = 0)
	    else:
	        analyzed_bounding_boxes_df = pd.DataFrame()

	    analyzed_bounding_boxes_df.to_csv(output_folder + "annotations_made.csv")

	    return pdf