Spaces:

Sarathkumar1304ai
/

streamlitwebapp

Running

App Files Files Community

streamlitwebapp / src /data_splitting.py

Sarathkumar1304ai

Upload 91 files

2953afe verified 7 months ago

raw

history blame

2.73 kB

	import logging
	import pandas as pd
	from sklearn.model_selection import train_test_split
	from typing import Tuple

	class DataSplitter:
	def __init__(self, df: pd.DataFrame, target_column: str, test_size: float = 0.2, random_state: int = 42):
	"""
	Initialize the DataSplitter with a DataFrame and parameters for splitting.

	Parameters:
	df : pd.DataFrame
	The input dataframe to be split.
	target_column : str
	The name of the target column in the dataframe.
	test_size : float, optional
	The proportion of the dataset to include in the test split. Default is 0.2.
	random_state : int, optional
	Controls the shuffling applied to the data before splitting. Default is 42.
	"""
	self.df = df
	self.target_column = target_column
	self.test_size = test_size
	self.random_state = random_state

	# Configure logging
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

	# Check if target_column exists in the DataFrame
	if self.target_column not in self.df.columns:
	raise ValueError(f"Target column '{self.target_column}' does not exist in the DataFrame.")

	logging.info("DataSplitter initialized successfully.")

	def split_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
	"""
	Split the dataframe into train and test sets.

	Returns:
	Tuple of X_train, X_test, y_train, y_test:
	X_train : pd.DataFrame
	Training set features.
	X_test : pd.DataFrame
	Testing set features.
	y_train : pd.Series
	Training set target variable.
	y_test : pd.Series
	Testing set target variable.
	"""
	logging.info(f"Starting train-test split with test_size={self.test_size} and random_state={self.random_state}.")


	X = self.df.drop(columns=[self.target_column], axis=1)
	y = self.df[self.target_column]


	logging.info(f"Feature set shape: {X.shape}")
	logging.info(f"Target set shape: {y.shape}")


	X_train, X_test, y_train, y_test = train_test_split(
	X, y, test_size=self.test_size, random_state=self.random_state
	)


	logging.info(f"Train feature set shape: {X_train.shape}")
	logging.info(f"Test feature set shape: {X_test.shape}")
	logging.info(f"Train target set shape: {y_train.shape}")
	logging.info(f"Test target set shape: {y_test.shape}")
	logging.info("Train-test split completed successfully.")

	return X_train, X_test, y_train, y_test