File size: 2,401 Bytes
8b414b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""This file contains functions that can generates hand-crafted features from the text"""

# Names exported by a star-import of this module. `get_word_counter` is defined
# in this file but not listed here, so it is treated as an internal helper.
__all__ = [
    'count_words',
    'count_punctuation',
    'count_how_many_words_are_repeating',
    'count_misspelled_words',
    'preprocess_test'
]

import re
from collections import Counter
from string import punctuation
from typing import Dict

from src.spell_checker import SmartSpellChecker

# Placeholder tokens (presumably anonymisation tags in the dataset -- confirm)
# and the plain word each one is replaced with before further cleaning.
# Several keys are misspelled variants of others (e.g. 'PROEPR_NAME',
# 'Generic_Namea'); they are kept so those variants are still normalised.
underscores_to_replace = {
    'Generic_Name': 'name',
    'OTHER_NAME': 'name',
    'STUDENT_NAME': 'name',
    'Generic_Namea': 'name',
    'PROPER_NAME': 'proper name',
    'PROEPR_NAME': 'proper name',
    'Generic_School': 'school',
    'SCHOOL_NAME': 'school',
    'Generic_school': 'school',
    'TEACHER_NAME': 'teacher',
    'Generic_City': 'city',
    'LOCATION_NAME': 'location',
    'STORE_NAME': 'store',
    'RESTAURANT_NAME': 'restaurant',
    'LANGUAGE_NAME': 'language',
}


def preprocess_test(text: str) -> str:
    """Normalise raw text before feature extraction.

    Steps, in order:
      1. Replace every placeholder from ``underscores_to_replace`` with its
         plain-word substitute.
      2. Replace digits and the special characters
         ``% @ \\ # $ & ^ " _ ( ) * + - /`` with a space.
      3. Replace newlines and tabs with a space.
      4. Insert a space after ``. , : ; ! ?`` when the next character is not
         whitespace.
      5. Collapse every run of whitespace into a single space.

    Leading/trailing whitespace is collapsed but not stripped.

    :param text: raw input text.
    :return: the cleaned text.
    """
    # Longest placeholders first: 'Generic_Name' is a prefix of
    # 'Generic_Namea', so replacing the short key first would leave a stray
    # 'a' behind ("namea") and the longer key would never match.
    for placeholder in sorted(underscores_to_replace, key=len, reverse=True):
        text = text.replace(placeholder, underscores_to_replace[placeholder])

    text = re.sub(r"[\d%@\\#$&^\"_()*+\-/]", " ", text)
    text = re.sub(r"\n|\t", " ", text)
    text = re.sub(r'(?<=[.,:;!?])(?=\S)', " ", text)  # Add space after punctuation
    text = re.sub(r"\s+", " ", text)

    return text


def get_word_counter(text: str) -> Dict[str, int]:
    """Count how often each whitespace-separated word occurs in ``text``.

    The characters ``. , ! ? ; :`` are turned into spaces first, so they act
    as word separators and never end up inside a counted word.

    :param text: text to tokenise.
    :return: a ``Counter`` mapping each word to its number of occurrences.
    """
    separators = ".,!?;:"
    without_punctuation = text.translate(str.maketrans(separators, " " * len(separators)))
    tokens = without_punctuation.split()

    return Counter(tokens)


def count_punctuation(text: str) -> Dict[str, int]:
    """Count occurrences of each punctuation character and of the space.

    One feature is produced per character in ``string.punctuation`` plus the
    space character, keyed as ``count_<character>``.

    :param text: text to inspect.
    :return: mapping from feature name to the number of occurrences in ``text``.
    """
    symbols = punctuation + " "
    return {f'count_{char}': text.count(char) for char in symbols}


def count_how_many_words_are_repeating(text: str) -> Dict[str, int]:
    """Count the distinct words that occur at least N times, for N = 3..9.

    :param text: text to analyse; it is tokenised with ``get_word_counter``.
    :return: mapping ``'<N>_word_repeated'`` -> number of distinct words whose
        occurrence count is greater than or equal to ``N``.
    """
    occurrences = get_word_counter(text).values()

    return {
        f'{threshold}_word_repeated': sum(1 for times in occurrences if times >= threshold)
        for threshold in range(3, 10)
    }


def count_misspelled_words(text: str, spellcheck: SmartSpellChecker) -> Dict[str, int]:
    """Count the words of ``text`` that the spell checker reports as unknown.

    The word counter built by ``get_word_counter`` is handed to
    ``spellcheck.unknown`` and the size of its result is reported.
    NOTE(review): presumably ``unknown`` returns the collection of words it
    does not recognise -- confirm against ``SmartSpellChecker``.

    :param text: text to check.
    :param spellcheck: spell checker providing an ``unknown`` method.
    :return: single-entry dict with key ``'n_misspelled_words'``.
    """
    words = get_word_counter(text)
    not_recognised = spellcheck.unknown(words)
    n_misspelled = len(not_recognised)
    return {'n_misspelled_words': n_misspelled}


def count_words(text: str) -> Dict[str, int]:
    """Return the number of distinct words in ``text``.

    The value is the size of the counter built by ``get_word_counter``, so a
    word that occurs several times contributes only once.

    :param text: text to analyse.
    :return: single-entry dict with key ``'length'``.
    """
    distinct_words = get_word_counter(text)
    n_distinct = len(distinct_words)
    return {'length': n_distinct}