File size: 2,099 Bytes
8b414b0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# Public API of this module: only the spell-checker class is exported by
# ``from <module> import *``.
__all__ = ['SmartSpellChecker']
import re
from collections import Counter
from functools import lru_cache
from typing import Iterable
from spellchecker import SpellChecker
def get_word_counter(text: str):
    """Count the whitespace-separated words of *text*.

    The punctuation marks ``. , ! ? ; :`` are turned into spaces first, so
    they separate words instead of sticking to them. Every other character
    (apostrophes, quotes, brackets, ...) is kept as part of its word, and
    the original casing is preserved.

    Returns a ``collections.Counter`` mapping each word to its number of
    occurrences.
    """
    without_punctuation = re.sub(r"[.,!?;:]", " ", text)
    words = without_punctuation.split()
    return Counter(words)
# Misspellings with a fixed, hand-picked replacement.
# ``SmartSpellChecker.correct_word`` consults this table before any other rule.
custom_mappings = {
    "alot": "a lot",
    "classwork": "class work",
    "everytime": "every time",
    "loosing": "losing",
    "clases": "classes",
    "payed": "paid",
    "learnd": "learned",
    "ect": "etc",
    "wasnt": "wasn't",
    "wich": "which",
    "sol's": "souls",
    "thigs": "things",
    "activies": "activities",
    "oline": "online",
    "thru": "through",
    "inconclusion": "in conclusion",
}

# Literal substring replacements that ``SmartSpellChecker.correct_text``
# applies with ``str.replace``, in this order, before it looks for unknown
# words. " u " carries its surrounding spaces so that only a standalone "u"
# is rewritten; the other keys match wherever they occur in the text.
skipped_mappings = {
    " u ": " you ",
    "youll": "you will",
    "wont": "won't",
}

# Words ``SmartSpellChecker.correct_word`` never corrects: it returns None
# for them, so they stay in the text unchanged.
exclude_words_from_check = {
    "you're",
    "covid",
}

# Words ``SmartSpellChecker.correct_word`` maps to the empty string, i.e.
# they are removed from the text.
black_list = {
    "ther",
    "waldo's",
    "f's",
    "",
}
class SmartSpellChecker:
    """Spell checker that puts project-specific rules in front of ``SpellChecker``.

    A word is corrected using, in order: the fixed replacements in
    ``custom_mappings``, the removal list ``black_list``, the skip list
    ``exclude_words_from_check``, a minimum-length rule, and finally the
    wrapped ``SpellChecker`` instance stored in ``self.spellcheck``.
    """

    def __init__(self):
        self.spellcheck = SpellChecker()
        # Per-instance memo of word -> correction. A ``functools.lru_cache``
        # on the method itself would key on ``self`` and keep every instance
        # (and its dictionary) alive for as long as the class exists.
        self._corrections = {}

    def correct_word(self, mismatch: str):
        """Return the replacement for one misspelled word.

        Results are memoised per instance, so each distinct word is resolved
        only once.

        Args:
            mismatch: the word to correct, typically one reported by
                :meth:`unknown`.

        Returns:
            The replacement text (``""`` means "remove the word"), or
            ``None`` when the word must be left untouched.
        """
        cache = getattr(self, "_corrections", None)
        if cache is None:
            # Instance was created without running __init__ (e.g. via
            # __new__ or a subclass that skips super().__init__()).
            cache = self._corrections = {}
        if mismatch not in cache:
            cache[mismatch] = self._find_correction(mismatch)
        return cache[mismatch]

    def _find_correction(self, mismatch):
        """Apply the correction rules to *mismatch* without any caching."""
        if mismatch in custom_mappings:
            return custom_mappings[mismatch]
        if mismatch in black_list:
            return ""
        if mismatch in exclude_words_from_check:
            return None
        # The spell checker sometimes reports very short tokens such as 'b'
        # or 'c' as misspelled; anything of two characters or fewer is left
        # as it is.
        if len(mismatch) <= 2:
            return None
        return self.spellcheck.correction(mismatch)

    def correct_text(self, text: str):
        """Return *text* with known shorthand expanded and misspellings fixed.

        First every key of ``skipped_mappings`` is replaced literally, in the
        mapping's order. Then the words of the text are extracted with
        ``get_word_counter``, and each word reported by :meth:`unknown` is
        replaced by its :meth:`correct_word` result, unless that result is
        ``None``.

        A misspelling is replaced only where it stands as a whole word, i.e.
        not directly next to a letter, digit, underscore or apostrophe.
        Plain substring replacement would damage correct words that merely
        contain it: removing the black-listed "ther" must not turn "another"
        into "ano".

        Matching is case-sensitive, exactly like ``str.replace``.
        """
        for key, value in skipped_mappings.items():
            text = text.replace(key, value)

        word_count = get_word_counter(text)
        for misspell in self.unknown(word_count):
            if not misspell:
                # An empty pattern would match at every word gap.
                continue
            correct = self.correct_word(misspell)
            if correct is None or correct == misspell:
                continue
            whole_word = r"(?<![\w'])" + re.escape(misspell) + r"(?![\w'])"
            # A function replacement keeps backslashes in ``correct`` from
            # being interpreted as regex escapes or group references.
            text = re.sub(whole_word, lambda _match, _new=correct: _new, text)
        return text

    def unknown(self, words: Iterable):
        """Return what ``self.spellcheck.unknown(words)`` returns.

        That is the collection of *words* the wrapped spell checker does not
        recognise. NOTE(review): the library may normalise case in the
        reported words; confirm against the installed version.
        """
        return self.spellcheck.unknown(words)
|