|
import logging
from typing import Tuple

import pandas as pd
from sklearn.model_selection import train_test_split
|
|
|
class DataSplitter:
    """Split a DataFrame into train/test feature and target sets.

    The target column is separated from the remaining columns (the
    features) and both are passed to ``train_test_split`` using the
    ``test_size`` and ``random_state`` supplied at construction time.
    """

    def __init__(self, df: pd.DataFrame, target_column: str, test_size: float = 0.2, random_state: int = 42):
        """
        Initialize the DataSplitter with a DataFrame and parameters for splitting.

        Parameters
        ----------
        df : pd.DataFrame
            The input dataframe to be split.
        target_column : str
            The name of the target column in the dataframe.
        test_size : float, optional
            The proportion of the dataset to include in the test split. Default is 0.2.
        random_state : int, optional
            Controls the shuffling applied to the data before splitting. Default is 42.

        Raises
        ------
        ValueError
            If ``target_column`` is not one of the columns of ``df``.
        """
        self.df = df
        self.target_column = target_column
        self.test_size = test_size
        self.random_state = random_state

        # NOTE(review): configuring the root logger from a constructor is a
        # process-wide side effect. It is kept because existing callers may
        # rely on it for INFO output; basicConfig does nothing when the root
        # logger already has handlers, so repeated instantiation is harmless.
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

        # Fail fast here instead of with a KeyError later in split_data().
        if self.target_column not in self.df.columns:
            raise ValueError(f"Target column '{self.target_column}' does not exist in the DataFrame.")

        logging.info("DataSplitter initialized successfully.")

    def split_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
        """
        Split the dataframe into train and test sets.

        Returns
        -------
        X_train : pd.DataFrame
            Training set features.
        X_test : pd.DataFrame
            Testing set features.
        y_train : pd.Series
            Training set target variable.
        y_test : pd.Series
            Testing set target variable.
        """
        # Log arguments are passed lazily (%-style) so the message is only
        # formatted when the record is actually emitted.
        logging.info(
            "Starting train-test split with test_size=%s and random_state=%s.",
            self.test_size,
            self.random_state,
        )

        # `columns=` already selects the column axis; no `axis` argument needed.
        X = self.df.drop(columns=[self.target_column])
        y = self.df[self.target_column]

        logging.info("Feature set shape: %s", X.shape)
        logging.info("Target set shape: %s", y.shape)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state
        )

        logging.info("Train feature set shape: %s", X_train.shape)
        logging.info("Test feature set shape: %s", X_test.shape)
        logging.info("Train target set shape: %s", y_train.shape)
        logging.info("Test target set shape: %s", y_test.shape)
        logging.info("Train-test split completed successfully.")

        return X_train, X_test, y_train, y_test
|
|