# linguask/src/solutions/bert_featurizer_solution.py
# Repository page header: GitHub Action, refs/heads/ci-cd/hugging-face, 8b414b0
# (page links: raw, history, blame; file size: 5.05 kB)
from pathlib import Path
from typing import Union
import pandas as pd
import torch.cuda
from catboost import CatBoostRegressor
from transformers import logging as transformers_logging
from src.cross_validate import CrossValidation
from src.feature_extractors.bert_pretrain_extractor import \
BertPretrainFeatureExtractor
from src.feature_extractors.text_statistics_extractor import \
HandcraftedTextFeatureExtractor
from src.solutions.base_solution import BaseSolution
from src.solutions.constant_predictor import load_train_test_df
from src.spell_checker import SmartSpellChecker
from src.text_preprocessings.spellcheck_preprocessing import \
SpellcheckTextPreprocessor
from src.utils import get_x_columns, seed_everything, validate_x, validate_y
# Only let error-level messages from the transformers library through.
transformers_logging.set_verbosity_error()
# NOTE(review): presumably seeds the random number generators for
# reproducibility -- confirm against src.utils.seed_everything.
seed_everything()
# Module-level spell checker instance; the predictor class below passes this
# same object to both its handcrafted feature extractor and its spell-check
# text preprocessor.
spellcheck = SmartSpellChecker()
class BertWithHandcraftedFeaturePredictor(BaseSolution):
    """Predict essay scores with one CatBoost regressor per target column.

    Texts are converted to features by a pretrained BERT feature extractor,
    and an independent ``CatBoostRegressor`` is fitted for every entry of
    ``self.columns``.

    NOTE(review): a spell-check preprocessor and a handcrafted feature
    extractor are constructed in ``__init__`` but are not part of the
    feature pipeline; ``transform_data`` returns BERT features of the raw
    text only. Wiring them in would change the feature set and invalidate
    any previously saved models, so that decision is left to the owner.
    """

    # 'GPU' when CUDA is available, otherwise None. Not currently forwarded
    # to CatBoost: the regressors below are created with task_type="CPU".
    device = 'GPU' if torch.cuda.is_available() else None

    def __init__(
        self,
        model_name: str,
        catboost_iter: int,
        saving_dir: str
    ):
        """Build the feature extractors and one regressor per target.

        Args:
            model_name: Name passed to ``BertPretrainFeatureExtractor``.
            catboost_iter: Number of boosting iterations for every regressor.
            saving_dir: Accepted for interface compatibility; this class
                does not read it.
        """
        super().__init__()

        self.feature_extractor = HandcraftedTextFeatureExtractor(spellcheck)
        self.text_preprocessing = SpellcheckTextPreprocessor(spellcheck)
        self.bert = BertPretrainFeatureExtractor(model_name=model_name)

        # One regression model per target column, kept in the same order as
        # ``self.columns`` so the two lists can be zipped together.
        self.columns = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
        self.models = [
            CatBoostRegressor(
                iterations=catboost_iter,
                task_type="CPU",  # self.device,
                verbose=True,
            ) for _ in self.columns
        ]

    @staticmethod
    def _model_path(directory: Path, column: str) -> Path:
        """Return the checkpoint file path used for ``column`` in ``directory``."""
        return directory / f'catboost_{column}.cbm'

    def transform_data(self, X: pd.Series) -> pd.DataFrame:
        """Return the BERT features generated for the raw texts in ``X``.

        Earlier revisions also ran spell-check preprocessing, a second BERT
        pass and handcrafted feature extraction here, but every one of those
        results was overwritten before the return. Only the call whose result
        was actually returned is kept, so the output is unchanged while the
        discarded work is no longer performed.

        Args:
            X: Texts to featurize.

        Returns:
            Whatever ``self.bert.generate_features(X)`` produces.
        """
        return self.bert.generate_features(X)

    def fit(self, X: pd.DataFrame, y: pd.DataFrame, **kwargs) -> None:
        """Fit every per-column regressor on features derived from ``X``.

        Args:
            X: Input frame; must pass ``validate_x`` and expose ``full_text``.
            y: Target frame; must pass ``validate_y`` and contain every
                column listed in ``self.columns``.
            **kwargs: Ignored.
        """
        validate_x(X)
        validate_y(y)

        # Features are computed once and shared by all regressors.
        features_df = self.transform_data(X.full_text)

        for column, model in zip(self.columns, self.models):
            print(f"-> Training model on: {column}...")
            model.fit(X=features_df, y=y[column])

    def predict(self, X: pd.DataFrame) -> pd.DataFrame:
        """Predict every target column for the rows of ``X``.

        Args:
            X: Input frame; must pass ``validate_x`` and expose ``full_text``
                and ``text_id``.

        Returns:
            A frame indexed like ``X`` with one column per entry of
            ``self.columns`` plus a ``text_id`` column copied from ``X``.
        """
        validate_x(X)

        features_df = self.transform_data(X.full_text)

        prediction = {}
        for column, model in zip(self.columns, self.models):
            print(f"-> Predicting model on: {column}")
            prediction[column] = model.predict(features_df)

        y_pred = pd.DataFrame(prediction, index=X.index)
        y_pred['text_id'] = X.text_id
        return y_pred

    def save(self, directory: Union[str, Path]) -> None:
        """Save every regressor to ``directory`` as ``catboost_<column>.cbm``.

        The directory (including missing parents) is created when absent.

        Args:
            directory: Destination directory.
        """
        directory = Path(directory)
        # exist_ok avoids the check-then-create race of a separate exists().
        directory.mkdir(parents=True, exist_ok=True)

        for column, model in zip(self.columns, self.models):
            model.save_model(str(self._model_path(directory, column)))

    def load(self, directory: Union[str, Path]) -> None:
        """Load every regressor from ``catboost_<column>.cbm`` in ``directory``.

        Args:
            directory: Directory previously written by ``save``.

        Raises:
            OSError: If ``directory`` is not an existing directory.
        """
        directory = Path(directory)
        if not directory.is_dir():
            raise OSError(f"Dir. {directory.absolute()} does not exist")

        for column, model in zip(self.columns, self.models):
            model.load_model(str(self._model_path(directory, column)))
def main():
    """Cross-validate the predictor, write the results and a submission file.

    Steps, in order: load the train/test frames, run cross-validation with
    ``CrossValidation``, write ``cv_results.csv``, predict the test set into
    ``submission.csv`` and finally save the cross-validation object.
    """
    # Single source of truth for the run's hyperparameters and output path.
    config = {
        'model_name': 'bert-base-uncased',
        'catboost_iter': 5000,
        'n_splits': 5,
        'saving_dir': 'checkpoints/BertWithHandcraftedFeaturePredictor',
    }

    # Directory handed to CrossValidation; note it differs from
    # config['saving_dir'], which is where the final cv.save() writes.
    saving_dir = Path("checkpoints/bert_featurizer_solution")
    saving_dir.mkdir(exist_ok=True, parents=True)

    train_df, test_df = load_train_test_df()
    # For a quick smoke test, subsample here: train_df = train_df.iloc[:10]

    x_columns = get_x_columns()
    train_x = train_df[x_columns]
    # Targets are everything except the raw essay text.
    train_y = train_df.drop(columns=['full_text'])

    # Built from ``config`` so the values cannot drift from the dict above.
    predictor = BertWithHandcraftedFeaturePredictor(
        model_name=config['model_name'],
        catboost_iter=config['catboost_iter'],
        saving_dir=config['saving_dir'],
    )

    cv = CrossValidation(saving_dir=str(saving_dir), n_splits=config['n_splits'])
    results = cv.fit(predictor, train_x, train_y)

    print("CV results")
    print(results)
    results.to_csv("cv_results.csv")
    # NOTE(review): presumably the last row of ``results`` aggregates the
    # folds -- confirm against CrossValidation.fit.
    print(f"CV mean: {results.iloc[-1].mean()}")

    submission_df = cv.predict(test_df)
    submission_df.to_csv("submission.csv", index=False)

    cv.save(config['saving_dir'])
    print("Finished training!")


# Run the pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()