# linguask/src/solutions/many_bert_solution.py
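"""Multi-BERT + handcrafted-feature solution.

Combines features extracted from several pretrained BERT checkpoints with
handcrafted text statistics, then fits one CatBoost regressor per scoring
dimension (cohesion, syntax, vocabulary, phraseology, grammar, conventions).
"""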
import os
from pathlib import Path
from typing import Union
import pandas as pd
import torch.cuda
from catboost import CatBoostRegressor
from catboost.utils import get_gpu_device_count
from easydict import EasyDict as edict
from src.cross_validate import CrossValidation
from src.feature_extractors.bert_pretrain_extractor import \
ManyBertPretrainFeatureExtractor
from src.feature_extractors.text_statistics_extractor import \
HandcraftedTextFeatureExtractor
from src.solutions.base_solution import BaseSolution
from src.solutions.constant_predictor import load_train_test_df
from src.spell_checker import SmartSpellChecker
from src.text_preprocessings.spellcheck_preprocessing import \
SpellcheckTextPreprocessor
from src.utils import get_x_columns, seed_everything, validate_x, validate_y

seed_everything()
spellcheck = SmartSpellChecker()


class ManyBertWithHandcraftedFeaturePredictor(BaseSolution):
def __init__(
self,
model_names: list,
catboost_iter: int,
saving_dir: str,
):
        super().__init__()
        self.feature_extractor = HandcraftedTextFeatureExtractor(spellcheck)
        self.text_preprocessing = SpellcheckTextPreprocessor(spellcheck)
        self.berts = ManyBertPretrainFeatureExtractor(model_names=model_names)
        self.device = 'GPU' if torch.cuda.is_available() else None
        self.task_type = 'GPU' if get_gpu_device_count() > 0 else 'CPU'
        # one CatBoost regression model per target column (regression, not classification)
        self.columns = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
self.models = [
CatBoostRegressor(
iterations=catboost_iter,
task_type=self.task_type,
verbose=True,
) for _ in range(len(self.columns))
]

    def transform_data(self, X: pd.Series) -> pd.DataFrame:
        """Build features: BERT embeddings of the spellchecked text plus
        handcrafted statistics computed on the raw text."""
cleaned_text = self.text_preprocessing.preprocess_data(X)
bert_features = self.berts.generate_features(cleaned_text)
handcrafted_features = self.feature_extractor.generate_features(X)
features_df = pd.concat([bert_features, handcrafted_features], axis='columns')
return features_df

    def fit(self, X: pd.DataFrame, y: pd.DataFrame, **kwargs) -> None:
        """Fit one CatBoost regressor per target column on the shared features."""
validate_x(X)
validate_y(y)
features_df = self.transform_data(X.full_text)
        for ii, column in enumerate(self.columns):
            print(f"-> Training model on: {column}...")
            model = self.models[ii]
            target = y[column]
            # release GPU memory cached by the BERT forward passes so
            # CatBoost has room to train on the GPU
            torch.cuda.empty_cache()
            model.fit(X=features_df, y=target)
            torch.cuda.empty_cache()

    def predict(self, X: pd.DataFrame) -> pd.DataFrame:
        """Predict every target column and attach text_id to the result."""
validate_x(X)
features_df = self.transform_data(X.full_text)
prediction = {}
for ii, column in enumerate(self.columns):
print(f"-> Predicting model on: {column}")
model = self.models[ii]
prediction[column] = model.predict(features_df)
y_pred = pd.DataFrame(prediction, index=X.index)
y_pred['text_id'] = X.text_id
return y_pred

    def save(self, directory: Union[str, Path]) -> None:
        """Save each per-column CatBoost model as catboost_<column>.cbm."""
directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
for ii, model in enumerate(self.models):
column = self.columns[ii]
path = directory / f'catboost_{column}.cbm'
model.save_model(str(path))

    def load(self, directory: Union[str, Path]) -> None:
        """Load the per-column CatBoost models written by save()."""
directory = Path(directory)
if not directory.is_dir():
            raise OSError(f"Directory {directory.absolute()} does not exist")
for ii, model in enumerate(self.models):
column = self.columns[ii]
path = directory / f'catboost_{column}.cbm'
model.load_model(str(path))
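
# Example save/load round-trip (a minimal sketch; the model list, iteration
# count, and data frames below are illustrative assumptions, not part of main()):
#
#     predictor = ManyBertWithHandcraftedFeaturePredictor(
#         model_names=['bert-base-uncased'],
#         catboost_iter=100,
#         saving_dir='checkpoints/demo',
#     )
#     predictor.fit(train_x, train_y)
#     predictor.save('checkpoints/demo')
#     predictor.load('checkpoints/demo')
#     submission_df = predictor.predict(test_x)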


def main():
config = edict(
dict(
model_names=[
'bert-base-uncased',
'bert-base-cased',
'vblagoje/bert-english-uncased-finetuned-pos',
'bert-base-multilingual-cased',
'unitary/toxic-bert',
'bert-large-uncased'
],
catboost_iter=5000,
n_splits=5,
saving_dir='checkpoints/ManyBertWithHandcraftedFeaturePredictor',
)
)
train_df, test_df = load_train_test_df()
x_columns = get_x_columns()
train_x, train_y = train_df[x_columns], train_df.drop(columns=['full_text'])
predictor = ManyBertWithHandcraftedFeaturePredictor(
model_names=config.model_names,
catboost_iter=config.catboost_iter,
saving_dir=config.saving_dir,
)
cv = CrossValidation(saving_dir=config.saving_dir, n_splits=config.n_splits)
results = cv.fit(predictor, train_x, train_y)
print("CV results")
print(results)
print(f"CV mean: {results.iloc[len(results) - 1].mean()}")
cv.save(config.saving_dir)
submission_df = cv.predict(test_df)
submission_path = os.path.join(config.saving_dir, "submission.csv")
submission_df.to_csv(submission_path, index=False)
cv_results_path = os.path.join(config.saving_dir, "cv_results.csv")
results.to_csv(cv_results_path)
print("Finished training!")


if __name__ == '__main__':
main()