Spaces:
Sleeping
Sleeping
File size: 3,794 Bytes
54e6328 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import sys
from dataclasses import dataclass
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from src.exception import CustomException
from src.logger import logging
import os
from src.utils import save_object
@dataclass
class DataTransformationConfig:
    """Configuration for the data-transformation stage.

    The attribute is declared with a type annotation so that ``@dataclass``
    registers it as a real field: it then appears in ``__init__``/``__repr__``
    and can be overridden per instance, e.g.
    ``DataTransformationConfig(preprocessor_obj_file_path="tmp/p.pkl")``.
    """

    # Destination of the pickled preprocessing object.
    # NOTE(review): "proprocessor" looks like a typo for "preprocessor", but
    # the string is a runtime path; confirm nothing else loads this exact file
    # name before renaming it.
    preprocessor_obj_file_path: str = os.path.join('artifacts', "proprocessor.pkl")
class DataTransformation:
    """Build, fit and persist the preprocessing pipeline for the score data."""

    def __init__(self):
        # Holds the output path of the pickled preprocessing object.
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformer_object(self):
        """Build the (unfitted) column-wise preprocessing object.

        Numerical columns are median-imputed and standard-scaled. Categorical
        columns are imputed with the most frequent value, one-hot encoded and
        scaled without centering.

        Returns:
            An unfitted ``ColumnTransformer`` combining both pipelines.

        Raises:
            CustomException: wraps any error raised while building the object.
        """
        try:
            numerical_columns = ["writing_score", "reading_score"]
            categorical_columns = [
                "gender",
                "race_ethnicity",
                "parental_level_of_education",
                "lunch",
                "test_preparation_course",
            ]

            num_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ]
            )

            cat_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    # handle_unknown="ignore": a category that appears only
                    # in the data passed to transform() is encoded as all
                    # zeros instead of raising an error.
                    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
                    # with_mean=False: centering is not supported on the
                    # sparse output of the one-hot encoder.
                    ("scaler", StandardScaler(with_mean=False)),
                ]
            )

            logging.info("Categorical columns: %s", categorical_columns)
            logging.info("Numerical columns: %s", numerical_columns)

            preprocessor = ColumnTransformer(
                [
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipelines", cat_pipeline, categorical_columns),
                ],
                # Always return a dense array: the caller concatenates the
                # result with the target column through np.c_, which needs
                # a regular ndarray rather than a sparse matrix.
                sparse_threshold=0,
            )

            return preprocessor
        except Exception as e:
            raise CustomException(e, sys) from e

    def initiate_data_transformation(self, train_path, test_path):
        """Fit the preprocessor on the training CSV and transform both splits.

        The preprocessing object is fitted on the training features only,
        applied to the training and test features, and then saved to
        ``self.data_transformation_config.preprocessor_obj_file_path``.

        Args:
            train_path: location of the training CSV, passed to
                ``pd.read_csv``.
            test_path: location of the test CSV, passed to ``pd.read_csv``.

        Returns:
            A tuple ``(train_arr, test_arr, preprocessor_obj_file_path)``.
            Each array holds the transformed features with the
            ``math_score`` target appended as the last column.

        Raises:
            CustomException: wraps any error raised while reading,
                transforming or saving.
        """
        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)
            logging.info("Read train and test data completed")

            logging.info("Obtaining preprocessing object")
            preprocessing_obj = self.get_data_transformer_object()

            target_column_name = "math_score"

            # Split each frame into model inputs and the target series.
            input_feature_train_df = train_df.drop(columns=[target_column_name])
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name])
            target_feature_test_df = test_df[target_column_name]

            logging.info(
                "Applying preprocessing object on training dataframe and testing dataframe."
            )

            # Fit on the training split only; the test split is transformed
            # with the statistics learned from training data.
            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            # Append the target as the last column of each array.
            train_arr = np.c_[
                input_feature_train_arr, np.array(target_feature_train_df)
            ]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            preprocessor_path = self.data_transformation_config.preprocessor_obj_file_path
            save_object(
                file_path=preprocessor_path,
                obj=preprocessing_obj,
            )
            # Logged only after save_object has returned, so the message is
            # never emitted for a save that failed.
            logging.info("Saved preprocessing object.")

            return (
                train_arr,
                test_arr,
                preprocessor_path,
            )
        except Exception as e:
            raise CustomException(e, sys) from e