File size: 2,733 Bytes
92b63f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import logging
import pandas as pd
from sklearn.model_selection import train_test_split
from typing import Tuple

class DataSplitter:
    """
    Hold a DataFrame plus split settings and produce a train/test split.

    The target column is separated from the remaining columns and both parts
    are handed to scikit-learn's ``train_test_split``.
    """

    # Named logger so applications can filter or silence these messages
    # independently of the root logger; records still propagate to the root.
    _logger = logging.getLogger(__name__)

    def __init__(self, df: pd.DataFrame, target_column: str, test_size: float = 0.2, random_state: int = 42):
        """
        Initialize the DataSplitter with a DataFrame and parameters for splitting.

        Parameters:
        df : pd.DataFrame
            The input dataframe to be split.
        target_column : str
            The name of the target column in the dataframe.
        test_size : float, optional
            The proportion of the dataset to include in the test split. Default is 0.2.
            A float must lie strictly between 0 and 1. Other values (for example an
            int) are passed to ``train_test_split`` unchanged and validated there.
        random_state : int, optional
            Controls the shuffling applied to the data before splitting. Default is 42.

        Raises:
        ValueError
            If ``target_column`` is not a column of ``df``, or if ``test_size`` is a
            float outside the open interval (0, 1).
        """
        # Validate first so a rejected call neither stores state nor touches
        # the global logging configuration.
        if target_column not in df.columns:
            raise ValueError(f"Target column '{target_column}' does not exist in the DataFrame.")

        # Fail fast on a fraction that train_test_split would reject later anyway.
        # Only floats are checked here; other types keep their original handling.
        if isinstance(test_size, float) and not 0.0 < test_size < 1.0:
            raise ValueError(
                f"test_size must be strictly between 0 and 1 when given as a float, got {test_size}."
            )

        self.df = df
        self.target_column = target_column
        self.test_size = test_size
        self.random_state = random_state

        # Kept for backward compatibility: callers that never configured logging
        # still see INFO output. basicConfig does nothing when the root logger
        # already has handlers, so an application's own setup is not overridden.
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

        self._logger.info("DataSplitter initialized successfully.")

    def split_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
        """
        Split the dataframe into train and test sets.

        The features are every column except ``target_column``; the target is
        that column alone. Any exception raised by ``train_test_split``
        propagates to the caller unchanged.

        Returns:
        Tuple of X_train, X_test, y_train, y_test:
        X_train : pd.DataFrame
            Training set features.
        X_test : pd.DataFrame
            Testing set features.
        y_train : pd.Series
            Training set target variable.
        y_test : pd.Series
            Testing set target variable.
        """
        log = self._logger
        # Lazy %-style arguments: the message is only formatted if it is emitted.
        log.info(
            "Starting train-test split with test_size=%s and random_state=%s.",
            self.test_size,
            self.random_state,
        )

        # `columns=` already selects the axis, so no separate `axis` argument is needed.
        X = self.df.drop(columns=[self.target_column])
        y = self.df[self.target_column]

        log.info("Feature set shape: %s", X.shape)
        log.info("Target set shape: %s", y.shape)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state
        )

        log.info("Train feature set shape: %s", X_train.shape)
        log.info("Test feature set shape: %s", X_test.shape)
        log.info("Train target set shape: %s", y_train.shape)
        log.info("Test target set shape: %s", y_test.shape)
        log.info("Train-test split completed successfully.")

        return X_train, X_test, y_train, y_test