Spaces:

seanpedrickcase
/

document_redaction

Running

document_redaction / tools /load_spacy_model_custom_recognisers.py

Refactor redaction functionality and enhance UI components: Added support for custom recognizers and whole page redaction options. Updated file handling to include new dropdowns for entity selection and improved dataframes for entity management. Enhanced the annotator with better state management and UI responsiveness. Cleaned up redundant code and improved overall performance in the redaction process.

1d772de 3 months ago

raw

history blame

6.57 kB

	# %%
	import re
	from typing import List, Optional, Tuple

	import spacy
	from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
	from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
	from spacy.cli.download import download

	spacy.prefer_gpu()

	# %%
	# Name of the spaCy pipeline that is downloaded when the packaged import fails.
	model_name = "en_core_web_sm"  # alternative: "en_core_web_trf"
	# Score threshold handed to AnalyzerEngine as default_score_threshold.
	score_threshold = 0.001
	# Entity labels emitted by the custom recognisers defined in this module.
	custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]

	# Load spaCy model: prefer the installed `en_core_web_sm` package and fall
	# back to downloading `model_name` when that import/load fails.
	try:
	    import en_core_web_sm
	    nlp = en_core_web_sm.load()
	    print("Successfully imported spaCy model")

	except Exception:
	    # `except Exception` (not a bare `except`) keeps the download fallback
	    # for any import/load failure while letting KeyboardInterrupt and
	    # SystemExit propagate instead of triggering a download.
	    download(model_name)
	    nlp = spacy.load(model_name)
	    print("Successfully downloaded and imported spaCy model", model_name)

	# #### Custom recognisers
	# Allow user to create their own recogniser
	def custom_word_list_recogniser(custom_list: Optional[List[str]] = None):
	    """
	    Build a Presidio PatternRecognizer that flags user-supplied terms as CUSTOM.

	    Args:
	        custom_list: Terms to detect. Each term is regex-escaped, so it is
	            matched literally. None or an empty list produces a recogniser
	            whose pattern never matches.

	    Returns:
	        A PatternRecognizer named "CUSTOM" supporting the "CUSTOM" entity,
	        using the DOTALL, MULTILINE and IGNORECASE regex flags.
	    """
	    # Drop empty strings (an empty alternative would match at every position)
	    # and try longer terms first so a term is preferred over its own prefix.
	    terms = sorted((term for term in (custom_list or []) if term), key=len, reverse=True)

	    if terms:
	        alternatives = '|'.join(re.escape(term) for term in terms)
	        # Lookarounds rather than \b: a trailing \b fails after punctuation
	        # followed by whitespace, so terms such as "Inc." would not match.
	        custom_regex = rf'(?<!\w)(?:{alternatives})(?!\w)'
	    else:
	        # Empty negative lookahead: a valid pattern that can never match.
	        custom_regex = r'(?!)'

	    custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score=1)

	    custom_recogniser = PatternRecognizer(
	        supported_entity="CUSTOM",
	        name="CUSTOM",
	        patterns=[custom_pattern],
	        global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE,
	    )

	    return custom_recogniser

	# Initialise custom recogniser that will be overwritten later
	custom_recogniser = custom_word_list_recogniser()

	# Custom title recogniser
	titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
	# Longest alternatives first so the dotted form ("Mr.") is tried before the
	# bare form ("Mr"). Lookarounds replace \b because a trailing \b after "."
	# only succeeds when a word character follows, so "Mr. Smith" would
	# otherwise only ever be matched as "Mr".
	titles_regex = (
	    r'(?<!\w)(?:'
	    + '|'.join(re.escape(title) for title in sorted(titles_list, key=len, reverse=True))
	    + r')(?!\w)'
	)
	titles_pattern = Pattern(name="titles_pattern", regex=titles_regex, score=1)
	# No IGNORECASE here: titles are matched with the capitalisation listed above.
	titles_recogniser = PatternRecognizer(
	    supported_entity="TITLES",
	    name="TITLES",
	    patterns=[titles_pattern],
	    global_regex_flags=re.DOTALL | re.MULTILINE,
	)

	# %%
	# Custom postcode recogniser

	# Define the regex pattern in a Presidio `Pattern` object.
	# Two alternatives inside one group: the general outward/inward postcode
	# shape (optional single space between the halves) or the literal "GIR 0AA".
	# The separator must be a plain `|`; an escaped `\|` would be a literal pipe
	# and collapse both alternatives into one unmatchable sequence.
	ukpostcode_pattern = Pattern(
	    name="ukpostcode_pattern",
	    regex=r"\b([A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}|GIR ?0AA)\b",
	    score=1,
	)

	# Define the recognizer with one or more patterns
	ukpostcode_recogniser = PatternRecognizer(
	    supported_entity="UKPOSTCODE",
	    name="UKPOSTCODE",
	    patterns=[ukpostcode_pattern],
	)

	# %%
	# Examples for testing

	#text = "I live in 510 Broad st SE5 9NG ."

	#numbers_result = ukpostcode_recogniser.analyze(text=text, entities=["UKPOSTCODE"])
	#print("Result:")
	#print(numbers_result)

	# %%
	# Street-type suffixes recognised by extract_street_name (matched
	# case-insensitively as whole words). Duplicates from the original list are
	# removed; first-occurrence order is kept.
	_STREET_TYPES = (
	    'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
	    'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
	    'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
	    'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
	    'Alley', 'Arcade', 'Bay', 'Bend', 'Brae', 'Byway', 'Close', 'Corner',
	    'Crescent', 'Cres', 'Cul-de-sac', 'Dell', 'Esplanade', 'Glen', 'Green', 'Grove', 'Heights', 'Hts',
	    'Mews', 'Path', 'Piazza', 'Promenade', 'Quay', 'Ridge', 'Row', 'Terrace', 'Ter', 'Track', 'Trail', 'View', 'Villas',
	    'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side',
	)

	# Compiled once at import time: a word containing at least one digit
	# (\w*\d\w*), optional whitespace, then one word followed by a street type.
	_STREET_NAME_PATTERN = re.compile(
	    r'(?P<preceding_word>\w*\d\w*)\s*'
	    r'(?P<street_name>\w+\s*\b(?:' + '|'.join(re.escape(street_type) for street_type in _STREET_TYPES) + r')\b)',
	    re.IGNORECASE,
	)


	def extract_street_name(text: str) -> Tuple[List[int], List[int]]:
	    """
	    Locate street names together with the preceding word (which must contain
	    at least one digit, e.g. a house number such as "42" or "51a").

	    Args:
	        text: The text to search.

	    Returns:
	        A tuple (start_positions, end_positions) of equal-length lists. Each
	        index pair gives the character offsets of one match, spanning from
	        the start of the preceding word to the end of the street type.
	        Both lists are empty when nothing matches.
	    """
	    start_positions = []
	    end_positions = []

	    for match in _STREET_NAME_PATTERN.finditer(text):
	        start_positions.append(match.start())
	        end_positions.append(match.end())

	    return start_positions, end_positions


	# %%
	# Some examples for testing

	#text = "1234 Main Street, 5678 Oak Rd, 9ABC Elm Blvd, 42 Eagle st."
	#text = "Roberto lives in Five 10 Broad st in Oregon"
	#text = "Roberto lives in 55 Oregon Square"
	#text = "There is 51a no way I will do that"
	#text = "I am writing to apply for"

	#extract_street_name(text)

	# %%
	class StreetNameRecognizer(EntityRecognizer):
	    """Recogniser that reports the spans found by extract_street_name as STREETNAME."""

	    def load(self) -> None:
	        """No loading is required."""

	    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
	        """
	        Detect street names in `text`.

	        `entities` and `nlp_artifacts` are accepted but not used; detection
	        relies solely on extract_street_name.

	        Returns:
	            One RecognizerResult per match, with entity_type "STREETNAME",
	            score 1 and the match's start/end character offsets.
	        """
	        span_starts, span_ends = extract_street_name(text)

	        return [
	            RecognizerResult(
	                entity_type="STREETNAME",
	                start=span_start,
	                end=span_end,
	                score=1,
	            )
	            for span_start, span_end in zip(span_starts, span_ends)
	        ]

	# Module-level street-name recogniser instance, limited to the STREETNAME entity.
	street_recogniser = StreetNameRecognizer(
	    supported_entities=["STREETNAME"],
	)

	# SpacyNlpEngine subclass that wraps a spaCy model which is already loaded
	class LoadedSpacyNlpEngine(SpacyNlpEngine):
	    """SpacyNlpEngine whose `nlp` mapping holds a pre-loaded model under "en"."""

	    def __init__(self, loaded_spacy_model):
	        """
	        Args:
	            loaded_spacy_model: The already-loaded spaCy model to store
	                under the "en" key.
	        """
	        super().__init__()
	        # Assigned after the parent initialiser has run, so this mapping is
	        # the final value of self.nlp.
	        language_models = {"en": loaded_spacy_model}
	        self.nlp = language_models



	# Wrap the spaCy model loaded above in the custom NLP engine
	loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model=nlp)

	# English-only analyser using the module-level score threshold
	nlp_analyser = AnalyzerEngine(
	    nlp_engine=loaded_nlp_engine,
	    default_score_threshold=score_threshold,
	    supported_languages=["en"],
	    log_decision_process=False,
	)

	# Register the custom recognisers with the analyser, in the original order
	for _recogniser in (
	    street_recogniser,
	    ukpostcode_recogniser,
	    titles_recogniser,
	    custom_recogniser,
	):
	    nlp_analyser.registry.add_recognizer(_recogniser)