Spaces:

destiratnakomala
/

automation

Sleeping

App Files Files Community

automation / app.py

destiratnakomala

Update app.py

8a9febf verified 12 months ago

raw

history blame

14.1 kB

	import streamlit as st
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns
	from sklearn.model_selection import train_test_split, GridSearchCV
	from sklearn.linear_model import LinearRegression, Lasso
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.metrics import mean_squared_error, r2_score
	import joblib
	import streamlit as st
	import plotly.express as px
	import plotly.figure_factory as ff

	# Main function and the private helpers it delegates to
	_MODEL_OPTIONS = ["Linear Regression", "Random Forest Regression", "Lasso Regression"]

	_MISSING_STRATEGIES = [
	    "Drop Missing Values",
	    "Fill with Mean",
	    "Fill with Median",
	    "Fill with Mode",
	    "Do Nothing",
	]

	# Feedback shown for the missing-value strategy that is currently applied.
	_MISSING_STRATEGY_MESSAGES = {
	    "Drop Missing Values": "Dropped rows with missing values.",
	    "Fill with Mean": "Filled missing values with the mean.",
	    "Fill with Median": "Filled missing values with the median.",
	    "Fill with Mode": "Filled missing values with the mode for categorical columns.",
	    "Do Nothing": "No changes made to missing values.",
	}

	# Search space handed to GridSearchCV for each selectable model.
	_PARAM_GRIDS = {
	    "Linear Regression": {'fit_intercept': [True, False]},
	    "Random Forest Regression": {
	        'n_estimators': [50, 100, 200],
	        'max_depth': [10, 20, None],
	        'min_samples_split': [2, 5, 10],
	    },
	    "Lasso Regression": {'alpha': [0.01, 0.1, 1, 10], 'max_iter': [1000, 5000, 10000]},
	}

	# Session-state slots that must exist before any expander reads them.
	_SESSION_KEYS = (
	    "applied_missing_strategy",
	    "model_plot",
	    "model_metrics",
	    "model_label",
	    "model_bytes",
	    "tuned_model_plot",
	    "tuned_model_metrics",
	    "tuned_model_label",
	    "tuned_best_params",
	)


	def _load_data(uploaded_file):
	    """Return the uploaded CSV as a DataFrame, falling back to ``example.csv``.

	    Returns ``None`` (after showing an error in the UI) when no usable data
	    source is available, so the caller can stop rendering.
	    """
	    if uploaded_file is not None:
	        # Rewind first: the buffer may already have been consumed, and a
	        # second read from the end raises EmptyDataError.
	        uploaded_file.seek(0)
	        try:
	            return pd.read_csv(uploaded_file)
	        except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
	            st.error(f"Could not read the uploaded CSV file: {exc}")
	            return None

	    try:
	        data = pd.read_csv('example.csv')  # Load example CSV
	    except FileNotFoundError:
	        st.error("Example CSV file not found. Please upload your own CSV file.")
	        return None
	    st.info("Loaded example.csv")
	    return data


	def _apply_missing_strategy(data, strategy):
	    """Return a copy of ``data`` with the named missing-value strategy applied.

	    Mean/median only fill numeric columns (``numeric_only=True``); mode only
	    fills object-dtype columns and skips columns that have no mode at all.
	    Unknown strategies and "Do Nothing" return ``data`` unchanged.
	    """
	    if strategy == "Drop Missing Values":
	        return data.dropna()
	    if strategy == "Fill with Mean":
	        return data.fillna(data.mean(numeric_only=True))
	    if strategy == "Fill with Median":
	        return data.fillna(data.median(numeric_only=True))
	    if strategy == "Fill with Mode":
	        filled = data.copy()
	        for column in filled.select_dtypes(include=['object']).columns:
	            modes = filled[column].mode()
	            if not modes.empty:  # an all-NaN column has no mode
	                filled[column] = filled[column].fillna(modes.iloc[0])
	        return filled
	    return data


	def _build_model(model_option):
	    """Return a fresh, unfitted estimator for the selected model name."""
	    if model_option == "Random Forest Regression":
	        return RandomForestRegressor(random_state=42)
	    if model_option == "Lasso Regression":
	        return Lasso()
	    return LinearRegression()


	def _fit_and_score(estimator, data, feature_columns, target_column):
	    """Fit ``estimator`` on an 80/20 split and score it on the held-out part.

	    Returns ``(estimator, y_test, y_pred, mse, r2)`` with ``y_test`` re-indexed
	    from zero, or ``None`` after reporting the problem in the UI when
	    splitting, fitting or scoring raises ``ValueError``.
	    """
	    X = data[feature_columns]
	    y = data[target_column]
	    try:
	        X_train, X_test, y_train, y_test = train_test_split(
	            X, y, test_size=0.2, random_state=42
	        )
	        estimator.fit(X_train, y_train)
	        y_pred = estimator.predict(X_test)
	        mse = mean_squared_error(y_test, y_pred)
	        r2 = r2_score(y_test, y_pred)
	    except ValueError as exc:
	        st.error(f"Model training failed: {exc}")
	        return None
	    return estimator, y_test.reset_index(drop=True), y_pred, mse, r2


	def _show_results(prefix, title_suffix=""):
	    """Render the stored prediction plot and metrics for one training flow.

	    ``prefix`` selects the session-state slots ("" for the plain model,
	    "tuned_" for the grid-searched one); ``title_suffix`` is appended to the
	    plot title.
	    """
	    plot_data = st.session_state[f"{prefix}model_plot"]
	    if plot_data is not None:
	        y_test, y_pred = plot_data
	        label = st.session_state[f"{prefix}model_label"]
	        fig, ax = plt.subplots(figsize=(10, 6))
	        ax.plot(y_test, label="True Values", color="blue", linestyle="--")
	        ax.plot(y_pred, label="Predicted Values", color="orange")
	        ax.set_title(f'{label}: True Values vs Predictions{title_suffix}')
	        ax.set_xlabel('Index')
	        ax.set_ylabel('Values')
	        ax.legend()
	        st.pyplot(fig)
	        plt.close(fig)  # release the figure; the script reruns on every interaction

	    metrics = st.session_state[f"{prefix}model_metrics"]
	    if metrics is not None:
	        mse, r2 = metrics
	        st.success(f"Mean Squared Error: {mse:.2f}")
	        st.success(f"R^2 Score: {r2:.2f}")


	def main():
	    """Render the Streamlit app: load, preview, clean, explore and model a CSV.

	    The page is built from seven expanders (data source, preview, cleaning,
	    EDA, feature selection, modelling, hyperparameter tuning). Results of
	    button-triggered actions are kept in ``st.session_state`` because a
	    button is only true for the single rerun that follows the click.
	    """
	    import io  # local import: only needed to serialise the model in memory

	    st.set_page_config(page_title="Data Automation-Machine Learning")
	    st.title("Machine Learning")

	    for key in _SESSION_KEYS:
	        if key not in st.session_state:
	            st.session_state[key] = None

	    with st.expander("1: Add Your Data Source"):
	        uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
	        # Falls back to example.csv when nothing is uploaded.
	        data = _load_data(uploaded_file)

	    if data is None:
	        # Nothing to work with; the error has already been shown.
	        return

	    with st.expander("2: DataSet Preview"):
	        # Step 2: Data Overview
	        view1, view2, view3, view4 = st.columns(4)
	        with view1:
	            st.write("Data Overview")
	            st.dataframe(data.head())
	        with view2:
	            st.write(" Data Description")
	            st.write(data.describe())
	        with view3:
	            st.write(" Missing Values")
	            st.write(data.isnull().sum())
	        with view4:
	            st.write(" Data Types")
	            st.write(data.dtypes)

	    with st.expander("3: Data Cleaning"):
	        # Step 3: Data Cleaning
	        clean1, clean2, clean3 = st.columns(3)
	        with clean1:
	            st.write(" Data Summary Before Cleaning")
	            st.write(data.describe())
	        with clean2:
	            st.write("Missing Values Before Cleaning:")
	            st.write(data.isnull().sum())
	        with clean3:
	            # Visualize missing values
	            if st.checkbox("Show Missing Values Heatmap"):
	                fig, ax = plt.subplots(figsize=(10, 6))
	                sns.heatmap(data.isnull(), cbar=False, cmap='viridis', ax=ax)
	                ax.set_title("Missing Values Heatmap")
	                st.pyplot(fig)
	                plt.close(fig)

	        clean4, clean5 = st.columns(2)
	        with clean4:
	            # Remove duplicates
	            if st.checkbox("Remove Duplicate Rows"):
	                initial_rows = data.shape[0]
	                data = data.drop_duplicates()
	                st.success(f"Removed {initial_rows - data.shape[0]} duplicate rows.")

	        with clean5:
	            # Handle missing values
	            missing_strategy = st.selectbox(
	                "Choose a strategy for handling missing values",
	                options=_MISSING_STRATEGIES,
	            )
	            if st.button("Apply Missing Value Strategy"):
	                st.session_state.applied_missing_strategy = missing_strategy

	            # Re-apply the remembered strategy on every rerun; otherwise the
	            # cleaning would be lost as soon as another widget is touched.
	            applied_strategy = st.session_state.applied_missing_strategy
	            if applied_strategy is not None:
	                data = _apply_missing_strategy(data, applied_strategy)
	                message = _MISSING_STRATEGY_MESSAGES[applied_strategy]
	                if applied_strategy == "Do Nothing":
	                    st.info(message)
	                else:
	                    st.success(message)

	        clean7, clean8 = st.columns(2)
	        with clean7:
	            # Display basic info after cleaning
	            st.write(" Data Summary After Cleaning")
	            st.write(data.describe())
	        with clean8:
	            st.write("Missing Values After Cleaning:")
	            st.write(data.isnull().sum())

	    with st.expander('4: EDA'):
	        # Step 4: Exploratory Data Analysis (EDA)
	        st.write("Correlation Matrix")

	        # Only numeric columns can be correlated; text columns would raise.
	        correlation_matrix = data.corr(numeric_only=True)

	        if correlation_matrix.empty:
	            st.info("No numeric columns available for a correlation matrix.")
	        else:
	            # Create a heatmap using Plotly (rounded so the annotations stay readable)
	            fig = ff.create_annotated_heatmap(
	                z=correlation_matrix.round(2).values,
	                x=list(correlation_matrix.columns),
	                y=list(correlation_matrix.index),
	            )

	            # Update layout for better readability
	            fig.update_layout(
	                title="Correlation Matrix",
	                xaxis_title="Features",
	                yaxis_title="Features",
	                width=700,  # Adjust width as needed
	                height=500,  # Adjust height as needed
	            )

	            # Display the figure in Streamlit
	            st.plotly_chart(fig)

	        numeric_columns = data.select_dtypes(include="number").columns
	        eda1, eda2 = st.columns(2)
	        with eda1:
	            # Plotting distributions for numerical features
	            if st.checkbox("Show Distribution Plots for Numeric Features"):
	                for column in numeric_columns:
	                    fig, ax = plt.subplots(figsize=(8, 4))
	                    sns.histplot(data[column], bins=30, kde=True, ax=ax)
	                    ax.set_title(f'Distribution of {column}')
	                    st.pyplot(fig)
	                    plt.close(fig)
	        with eda2:
	            # Boxplots for outlier detection
	            if st.checkbox("Show Boxplots for Numeric Features"):
	                for column in numeric_columns:
	                    fig, ax = plt.subplots(figsize=(8, 4))
	                    sns.boxplot(x=data[column], ax=ax)
	                    ax.set_title(f'Boxplot of {column}')
	                    st.pyplot(fig)
	                    plt.close(fig)

	    with st.expander("5: Feature Engineering"):
	        target_column = st.selectbox("Select the target variable", options=data.columns)
	        feature_columns = st.multiselect(
	            "Select features", options=data.columns.drop(target_column)
	        )

	    with st.expander("6: Modelling "):
	        # Model training
	        model_option = st.selectbox("Select Regression Model", options=_MODEL_OPTIONS)

	        if st.button("Train Model (Without Hyperparameter Tuning)"):
	            if feature_columns:
	                outcome = _fit_and_score(
	                    _build_model(model_option), data, feature_columns, target_column
	                )
	                if outcome is not None:
	                    model, y_test, y_pred, mse, r2 = outcome

	                    # Serialise in memory rather than to a user-named path on
	                    # the server, and keep the bytes so the download button
	                    # survives later reruns.
	                    buffer = io.BytesIO()
	                    joblib.dump(model, buffer)
	                    st.session_state.model_bytes = buffer.getvalue()

	                    # Step 7: Visualization of Predictions (Line Plot)
	                    st.session_state.model_plot = (y_test, y_pred)
	                    st.session_state.model_metrics = (mse, r2)
	                    st.session_state.model_label = model_option
	                    st.success("Model trained successfully! Use the button below to download it.")
	            else:
	                st.warning("Select at least one feature in step 5 before training.")

	        # Rendered outside the button branch so the widgets persist and the
	        # entered name is actually used.
	        if st.session_state.model_bytes is not None:
	            model_name = st.text_input('Enter model name', 'my_model')
	            safe_name = "".join(
	                ch for ch in model_name if ch.isalnum() or ch in "-_"
	            ) or "my_model"
	            st.download_button(
	                label="Download Model",
	                data=st.session_state.model_bytes,
	                file_name=f"{safe_name}.pkl",
	                mime="application/octet-stream",
	            )

	        # Display model plot and metrics if available
	        _show_results("")

	    with st.expander("7: HyperParameter"):
	        # Step 8: Hyperparameter Tuning
	        st.write("Hyperparameter Tuning")
	        if feature_columns:
	            hyperparam_model_option = st.selectbox(
	                "Select Model for Hyperparameter Tuning", options=_MODEL_OPTIONS
	            )

	            if st.button("Train Model with Hyperparameter Tuning"):
	                grid_search = GridSearchCV(
	                    _build_model(hyperparam_model_option),
	                    _PARAM_GRIDS[hyperparam_model_option],
	                    cv=5,
	                )
	                # GridSearchCV refits the best estimator by default, so its
	                # predict() delegates to best_estimator_.
	                outcome = _fit_and_score(grid_search, data, feature_columns, target_column)
	                if outcome is not None:
	                    fitted_search, y_test, y_pred, mse, r2 = outcome

	                    # Step 9: Visualization of Predictions (Line Plot)
	                    # Stored under separate keys so the tuned results never
	                    # overwrite (or get confused with) the untuned ones.
	                    st.session_state.tuned_model_plot = (y_test, y_pred)
	                    st.session_state.tuned_model_metrics = (mse, r2)
	                    st.session_state.tuned_model_label = hyperparam_model_option
	                    st.session_state.tuned_best_params = fitted_search.best_params_
	        else:
	            st.info("Select at least one feature in step 5 to enable hyperparameter tuning.")

	        # Show results
	        if st.session_state.tuned_best_params is not None:
	            st.success(f"Best Parameters: {st.session_state.tuned_best_params}")

	        # Display hyperparameter tuned model plot and metrics if available
	        _show_results("tuned_", " (Tuned)")



	# Run the app only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	    main()