from collections import defaultdict
from typing import Dict, List
import nltk
import numpy as np
import pandas as pd
from src.feature_extractors.base_extractor import BaseExtractor
class TermFrequencyFeatureExtractor(BaseExtractor):
    """Build a dataframe with a distribution of (log) term frequencies per text.

    Each text is tokenized, non-alphabetic tokens are dropped, every remaining
    word is mapped to its corpus frequency, and the log1p-transformed
    frequencies are binned into a normalized histogram. One feature column per
    histogram bin.

    Usage:
        >>> data = pd.read_csv("data/raw/train.csv").set_index("text_id")
        >>> featurizer = TermFrequencyFeatureExtractor()
        >>> X = featurizer.generate_features(data.full_text)
        >>> y = data["vocabulary"]
        >>> model = catboost.CatBoostRegressor()
        >>> model.fit(X, y)

    Possible improvements:
    - Add word corrections: triying -> trying
    - Count not only word frequencies, but number of unique words in each hist bin
    """

    # Largest count observed in the term-frequency dataset; fixes the upper
    # edge of the (log-space) histogram so bins are stable across instances.
    MAX_TERM_FREQUENCY = 23135751162

    # Default location of the "word,count" unigram-frequency CSV.
    DEFAULT_FREQUENCIES_PATH = "data/word_frequencies/unigram_freq.csv"

    def __init__(self, n_bins: int = 40, frequencies_path: str = DEFAULT_FREQUENCIES_PATH):
        """
        Args:
            n_bins: number of histogram bin *edges*; yields ``n_bins - 1``
                feature columns (kept as-is for backward compatibility).
            frequencies_path: CSV file with "word" and "count" columns used to
                look up per-word corpus frequencies.
        """
        self.term2freq: Dict[str, int] = self._load_term2freq_dict(frequencies_path)
        self.bins = self._make_bins(n_bins)
        self.feature_names = [
            f"bin_{round(self.bins[i], 1)}_{round(self.bins[i+1], 1)}"
            for i in range(len(self.bins) - 1)
        ]
        # word_tokenize requires the "punkt" tokenizer models.
        nltk.download("punkt")

    def _make_bins(self, n_bins: int) -> np.ndarray:
        """Return evenly spaced bin edges over [0, log1p(MAX_TERM_FREQUENCY)]."""
        min_bin = 0
        max_bin = np.log1p(self.MAX_TERM_FREQUENCY)
        bins = np.linspace(min_bin, max_bin, n_bins)
        return bins

    def _load_term2freq_dict(self, path: str) -> Dict[str, int]:
        """Load the word -> corpus-count mapping; unknown words map to 0."""
        term_frequencies = pd.read_csv(path)
        term2freq: Dict[str, int] = defaultdict(int)  # missing words -> frequency 0
        term2freq.update(term_frequencies.set_index("word").to_dict()["count"])
        return term2freq

    def generate_features(self, data: pd.Series) -> pd.DataFrame:
        """Extracts features from the text in the form of histogram of word frequencies

        Logarithm operation is applied to the frequencies for the sake of distribution
        normality.

        Args:
            data: series of raw text documents, indexed by document id.

        Returns:
            DataFrame with one row per input text and one column per histogram bin.
        """
        feature_df = data.apply(self._compute_word_frequency_histogram)
        feature_df.columns = self.feature_names
        return feature_df

    def _compute_word_frequency_histogram(self, text: str) -> pd.Series:
        """Map one text to its normalized log-frequency histogram."""
        term_frequencies: List[int] = self._compute_term_frequencies_from_text(text)
        histogram_values: np.ndarray = self._build_histogram(term_frequencies)
        return pd.Series(histogram_values)

    def _compute_term_frequencies_from_text(self, text: str) -> List[int]:
        """Tokenize, keep alphabetic tokens only, and look up each word's frequency."""
        tokens = nltk.tokenize.word_tokenize(text)
        words = [token.lower() for token in tokens if token.isalpha()]
        word_frequencies = [self.term2freq[word] for word in words]
        return word_frequencies

    def _build_histogram(self, values: List[int]) -> np.ndarray:
        """Return the histogram of log1p(values) over self.bins, normalized to sum to 1.

        A text with no alphabetic tokens yields an empty ``values`` list; the
        original 0/0 division produced NaN features, so return an all-zero
        vector instead.
        """
        if not values:
            return np.zeros(len(self.bins) - 1)
        values_log = np.log1p(values)
        histogram, _ = np.histogram(values_log, bins=self.bins)
        normalized_histogram = histogram / len(values)
        return normalized_histogram