Commit 4c95b3c
Parent(s): f188b10

Fix for fuzzy matching

Changed files:
- tools/file_redaction.py (+5 -3)
- tools/load_spacy_model_custom_recognisers.py (+111 -111)
tools/file_redaction.py
CHANGED

@@ -468,10 +468,12 @@ def choose_and_run_redactor(file_paths:List[str],
     ### Language check - check if selected language packs exist
     try:
         if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION and chosen_local_model == "tesseract":
-
-
+            if language != "en":
+                progress(0.1, desc=f"Downloading Tesseract language pack for {language}")
+                download_tesseract_lang_pack(language)
 
-
+            if language != "en":
+                progress(0.1, desc=f"Loading SpaCy model for {language}")
             load_spacy_model(language)
 
     except Exception as e:
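In isolation, the new flow is: for any non-English run, fetch the Tesseract language pack first, then load the matching SpaCy model; English skips the downloads. Below is a minimal sketch of that logic. The stubs stand in for the repo's own download_tesseract_lang_pack and load_spacy_model helpers, ensure_language_support is a hypothetical name, and the except body is illustrative since the hunk cuts off at the except line.

```python
# Minimal sketch of the new language check. The two stubs below stand in for
# the repo's own download_tesseract_lang_pack and load_spacy_model helpers.
def download_tesseract_lang_pack(language: str) -> None:
    print(f"(stub) would fetch the {language}.traineddata pack")

def load_spacy_model(language: str) -> None:
    print(f"(stub) would load a SpaCy model for {language}")

def ensure_language_support(language: str) -> None:
    try:
        if language != "en":
            # Non-English Tesseract runs need their language pack first
            print(f"Downloading Tesseract language pack for {language}")
            download_tesseract_lang_pack(language)
            print(f"Loading SpaCy model for {language}")
        # The model is loaded for every language; only the message is gated
        load_spacy_model(language)
    except Exception as e:
        # The commit's handler body is not shown in the hunk; re-raising here
        # is illustrative only
        raise RuntimeError(f"Language setup failed for {language}") from e

ensure_language_support("fr")
```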
tools/load_spacy_model_custom_recognisers.py
CHANGED

@@ -396,113 +396,6 @@ def custom_fuzzy_word_list_regex(text:str, custom_list:List[str]=[]):
 
     return start_positions, end_positions
 
-def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
-    ''' Conduct fuzzy match on a list of text data.'''
-
-    all_matches = []
-    all_start_positions = []
-    all_end_positions = []
-    all_ratios = []
-
-    #print("custom_query_list:", custom_query_list)
-
-    if not text:
-        out_message = "No text data found. Skipping page."
-        print(out_message)
-        return all_start_positions, all_end_positions
-
-    for string_query in custom_query_list:
-
-        #print("text:", text)
-        #print("string_query:", string_query)
-
-        query = nlp(string_query)
-
-        if search_whole_phrase == False:
-            # Keep only words that are not stop words
-            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]
-
-            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
-
-            #print("token_query:", token_query)
-
-            if len(token_query) > 1:
-                #pattern_lemma = [{"LEMMA": {"IN": query}}]
-                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
-            else:
-                #pattern_lemma = [{"LEMMA": query[0]}]
-                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]
-
-            matcher = Matcher(nlp.vocab)
-            matcher.add(string_query, [pattern_fuzz])
-            #matcher.add(string_query, [pattern_lemma])
-
-        else:
-            # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
-            #tokenised_query = [string_query.lower()]
-            # If you want to match the whole phrase, use phrase matcher
-            matcher = FuzzyMatcher(nlp.vocab)
-            patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
-            matcher.add("PHRASE", patterns, [{"ignore_case": True}])
-
-        batch_size = 256
-        docs = nlp.pipe([text], batch_size=batch_size)
-
-        # Get number of matches per doc
-        for doc in docs: #progress.tqdm(docs, desc = "Searching text", unit = "rows"):
-            matches = matcher(doc)
-            match_count = len(matches)
-
-            # If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
-            if search_whole_phrase==False:
-                all_matches.append(match_count)
-
-                for match_id, start, end in matches:
-                    span = str(doc[start:end]).strip()
-                    query_search = str(query).strip()
-                    #print("doc:", doc)
-                    #print("span:", span)
-                    #print("query_search:", query_search)
-
-                    # Convert word positions to character positions
-                    start_char = doc[start].idx # Start character position
-                    end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
-
-                    # The positions here are word position, not character position
-                    all_matches.append(match_count)
-                    all_start_positions.append(start_char)
-                    all_end_positions.append(end_char)
-
-            else:
-                for match_id, start, end, ratio, pattern in matches:
-                    span = str(doc[start:end]).strip()
-                    query_search = str(query).strip()
-                    #print("doc:", doc)
-                    #print("span:", span)
-                    #print("query_search:", query_search)
-
-                    # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
-                    distance = Levenshtein.distance(query_search.lower(), span.lower())
-
-                    #print("Levenshtein distance:", distance)
-
-                    if distance > spelling_mistakes_max:
-                        match_count = match_count - 1
-                    else:
-                        # Convert word positions to character positions
-                        start_char = doc[start].idx # Start character position
-                        end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
-
-                        #print("start_char:", start_char)
-                        #print("end_char:", end_char)
-
-                        all_matches.append(match_count)
-                        all_start_positions.append(start_char)
-                        all_end_positions.append(end_char)
-                        all_ratios.append(ratio)
-
-
-    return all_start_positions, all_end_positions
 
 class CustomWordFuzzyRecognizer(EntityRecognizer):
     def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):

@@ -537,13 +430,11 @@ class CustomWordFuzzyRecognizer(EntityRecognizer):
 custom_list_default = []
 custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
 
-
 # Pass the loaded model to the new LoadedSpacyNlpEngine
 loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp, language_code = ACTIVE_LANGUAGE_CODE)
 
-
 def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str] = None,
-                        spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, existing_nlp_analyser: AnalyzerEngine = None):
+                        spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, existing_nlp_analyser: AnalyzerEngine = None, return_also_model: bool = False):
     """
     Create an nlp_analyser object based on the specified language input.
 

@@ -552,6 +443,8 @@ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str]
         custom_list (List[str], optional): List of custom words to recognize. Defaults to None.
         spelling_mistakes_max (int, optional): Maximum number of spelling mistakes for fuzzy matching. Defaults to 1.
         search_whole_phrase (bool, optional): Whether to search for whole phrases or individual words. Defaults to True.
+        existing_nlp_analyser (AnalyzerEngine, optional): Existing nlp_analyser object to use. Defaults to None.
+        return_also_model (bool, optional): Whether to return the nlp_model object as well. Defaults to False.
 
     Returns:
         AnalyzerEngine: Configured nlp_analyser object with custom recognizers

@@ -606,10 +499,117 @@ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str]
     nlp_analyser.registry.add_recognizer(street_recogniser)
     nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
     nlp_analyser.registry.add_recognizer(titles_recogniser)
+
+    if return_also_model:
+        return nlp_analyser, nlp_model
 
     return nlp_analyser
 
 # Create the default nlp_analyser using the new function
-nlp_analyser = create_nlp_analyser(DEFAULT_LANGUAGE)
+nlp_analyser, nlp_model = create_nlp_analyser(DEFAULT_LANGUAGE, return_also_model=True)
+
+def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp_model, progress=gr.Progress(track_tqdm=True)):
+    ''' Conduct fuzzy match on a list of text data.'''
+
+    all_matches = []
+    all_start_positions = []
+    all_end_positions = []
+    all_ratios = []
+
+    #print("custom_query_list:", custom_query_list)
+
+    if not text:
+        out_message = "No text data found. Skipping page."
+        print(out_message)
+        return all_start_positions, all_end_positions
+
+    for string_query in custom_query_list:
+
+        query = nlp(string_query)
+
+        if search_whole_phrase == False:
+            # Keep only words that are not stop words
+            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]
+
+            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
+
+            if len(token_query) > 1:
+                #pattern_lemma = [{"LEMMA": {"IN": query}}]
+                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
+            else:
+                #pattern_lemma = [{"LEMMA": query[0]}]
+                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]
+
+            matcher = Matcher(nlp.vocab)
+            matcher.add(string_query, [pattern_fuzz])
+            #matcher.add(string_query, [pattern_lemma])
+
+        else:
+            # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
+            #tokenised_query = [string_query.lower()]
+            # If you want to match the whole phrase, use phrase matcher
+            matcher = FuzzyMatcher(nlp.vocab)
+            patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
+            matcher.add("PHRASE", patterns, [{"ignore_case": True}])
+
+        batch_size = 256
+        docs = nlp.pipe([text], batch_size=batch_size)
+
+        # Get number of matches per doc
+        for doc in docs: #progress.tqdm(docs, desc = "Searching text", unit = "rows"):
+            matches = matcher(doc)
+            match_count = len(matches)
+
+            # If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
+            if search_whole_phrase==False:
+                all_matches.append(match_count)
+
+                for match_id, start, end in matches:
+                    span = str(doc[start:end]).strip()
+                    query_search = str(query).strip()
+                    #print("doc:", doc)
+                    #print("span:", span)
+                    #print("query_search:", query_search)
+
+                    # Convert word positions to character positions
+                    start_char = doc[start].idx # Start character position
+                    end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
+
+                    # The positions here are word position, not character position
+                    all_matches.append(match_count)
+                    all_start_positions.append(start_char)
+                    all_end_positions.append(end_char)
+
+            else:
+                for match_id, start, end, ratio, pattern in matches:
+                    span = str(doc[start:end]).strip()
+                    query_search = str(query).strip()
+                    #print("doc:", doc)
+                    #print("span:", span)
+                    #print("query_search:", query_search)
+
+                    # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
+                    distance = Levenshtein.distance(query_search.lower(), span.lower())
+
+                    #print("Levenshtein distance:", distance)
+
+                    if distance > spelling_mistakes_max:
+                        match_count = match_count - 1
+                    else:
+                        # Convert word positions to character positions
+                        start_char = doc[start].idx # Start character position
+                        end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
+
+                        #print("start_char:", start_char)
+                        #print("end_char:", end_char)
+
+                        all_matches.append(match_count)
+                        all_start_positions.append(start_char)
+                        all_end_positions.append(end_char)
+                        all_ratios.append(ratio)
+
+
+    return all_start_positions, all_end_positions
+
 
 
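For reference, a hypothetical usage sketch of the reworked API. The create_nlp_analyser and spacy_fuzzy_search signatures are taken from the diff above; the import path and sample strings are assumptions.

```python
# Hypothetical usage sketch; the import path is an assumption.
from tools.load_spacy_model_custom_recognisers import (
    create_nlp_analyser,
    spacy_fuzzy_search,
)

# return_also_model=True is the new flag: it returns the underlying spaCy
# model alongside the analyser, so the fuzzy search can reuse that pipeline.
nlp_analyser, nlp_model = create_nlp_analyser("en", return_also_model=True)

# Whole-phrase fuzzy search: a match is kept only if the Levenshtein distance
# between the query and the matched span is within spelling_mistakes_max.
starts, ends = spacy_fuzzy_search(
    "Please redact Jon Smyth from this page.",
    custom_query_list=["John Smith"],
    spelling_mistakes_max=2,
    search_whole_phrase=True,
    nlp=nlp_model,
)
print(list(zip(starts, ends)))  # character offsets of accepted matches
```

This is also why the commit moves spacy_fuzzy_search below create_nlp_analyser: its nlp argument can now default to the nlp_model returned by the default create_nlp_analyser call, instead of a module-level nlp loaded separately.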
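The per-token path of spacy_fuzzy_search relies on spaCy's built-in FUZZY operators, available in Matcher patterns since spaCy 3.5. A self-contained sketch of that pattern style, with an assumed sample sentence:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
doc = nlp("Contact Jon Smyth about the report.")

matcher = Matcher(nlp.vocab)
# "FUZZY1" allows an edit distance of at most 1 per token; this is the string
# that spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
# builds in the function above.
pattern = [{"TEXT": {"FUZZY1": "John"}}, {"TEXT": {"FUZZY1": "Smith"}}]
matcher.add("NAME", [pattern])

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    # Token offsets are converted to character offsets, as in the commit's code
    print(span.text, span.start_char, span.end_char)
```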