###### SUPER SAFE ######
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import seaborn as sn
import streamlit as st
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

st.set_page_config(
    layout="wide",
)


def point_estimates(df, model_type, flag, identifier, control_sample_size, solver=None, max_iter=None, class_weights=None, max_depth=None, subsample=None, eta=None):
    """Fit a propensity model on PCA-reduced features and score every row of ``df``.

    Every column except ``identifier`` and ``flag`` is standardised and
    projected onto principal components; the leading components whose
    cumulative explained variance first exceeds 0.995 are kept. The model is
    trained on all rows with ``flag == 1`` plus a random sample of
    ``control_sample_size`` rows with ``flag == 0``, evaluated on that same
    training sample (classification report and confusion matrix are rendered
    with Streamlit), and finally used to score all rows.

    Args:
        df: Input frame holding the feature columns, ``flag`` and ``identifier``.
        model_type: ``'linear'`` (LogisticRegression) or ``'xgboost'``
            (XGBClassifier).
        flag: Name of the treatment-flag column; rows are split on the
            values 1 and 0.
        identifier: Name of the identifier column; it is only excluded from
            the features.
        control_sample_size: Number of ``flag == 0`` rows sampled for training
            (``random_state=42``).
        solver, max_iter, class_weights: Forwarded to LogisticRegression when
            not None; None leaves the estimator default in place.
        max_depth, subsample, eta: Forwarded to XGBClassifier.

    Returns:
        A single-column DataFrame ``propensity_score`` (second column of
        ``predict_proba``), one row per row of ``df`` in the same order and
        indexed like ``df``.

    Raises:
        ValueError: If ``model_type`` is not supported, or (from
            ``DataFrame.sample``) if ``control_sample_size`` exceeds the number
            of ``flag == 0`` rows.
    """
    if model_type not in ('linear', 'xgboost'):
        raise ValueError(f"Unsupported model_type: {model_type!r}")

    variance_threshold = 0.995

    # --- PCA on the standardised feature columns -------------------------
    features = df.drop(columns=[identifier, flag])
    scaled = StandardScaler().fit_transform(features)
    # PCA cannot extract more components than min(n_samples, n_features).
    pca = PCA(n_components=min(scaled.shape))
    components = pca.fit_transform(scaled)

    cumulative_variance = pca.explained_variance_ratio_.cumsum()
    above_threshold = np.flatnonzero(cumulative_variance > variance_threshold)
    # Keep every component if the threshold is never crossed.
    last_idx = int(above_threshold[0]) if above_threshold.size else len(cumulative_variance) - 1

    pca_cols = list(range(last_idx + 1))
    df_pca = pd.DataFrame(components[:, : last_idx + 1], columns=pca_cols)
    # Positional assignment: df may carry a non-default index, and label
    # alignment against the fresh RangeIndex would misplace or drop flags.
    df_pca[flag] = df[flag].to_numpy()

    # --- Training sample: all treated rows + sampled control rows --------
    treated = df_pca[df_pca[flag] == 1]
    control = df_pca[df_pca[flag] == 0].sample(n=control_sample_size, random_state=42)
    train = pd.concat([treated, control], ignore_index=True)
    X_train = train[pca_cols]
    y_train = train[flag]  # 1-D target avoids sklearn's column-vector warning

    # --- Fit ---------------------------------------------------------------
    if model_type == 'linear':
        lr_kwargs = {'solver': solver, 'max_iter': max_iter, 'class_weight': class_weights}
        # Passing None for solver/max_iter is rejected by sklearn, so unset
        # arguments fall back to the estimator defaults instead.
        model = LogisticRegression(**{k: v for k, v in lr_kwargs.items() if v is not None})
    else:
        model = xgb.XGBClassifier(max_depth=max_depth, subsample=subsample, eta=eta)
    model.fit(X_train, y_train)

    # --- In-sample diagnostics ---------------------------------------------
    y_fit = model.predict(X_train)
    classes = np.unique(y_train)
    # Each cell is the share of all training rows falling in that cell.
    cm = confusion_matrix(y_train, y_fit, labels=classes) / len(y_train)

    # Hover text per cell (i, j): actual class i, predicted class j.
    # '<br>' is Plotly's line break inside hover labels.
    hover_text = [
        [
            'Actual: {}<br>Predicted: {}<br>Value: {:.2f}'.format(actual, predicted, cm[i, j])
            for j, predicted in enumerate(classes)
        ]
        for i, actual in enumerate(classes)
    ]

    fig = ff.create_annotated_heatmap(
        z=cm,
        x=list(classes),
        y=list(classes),
        colorscale='blues',
        hoverinfo='text',
        text=hover_text,
    )
    fig.update_layout(
        title='Confusion Matrix',
        xaxis_title='Predicted',
        yaxis_title='Actual',
        font=dict(size=14)
    )

    report = classification_report(y_train, y_fit, output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    # --- Score every row ---------------------------------------------------
    probabilities = model.predict_proba(df_pca[pca_cols])
    scores = pd.DataFrame({'propensity_score': probabilities[:, 1]}, index=df.index)

    # --- Render ------------------------------------------------------------
    st.subheader("Classification Report")
    st.dataframe(report_df, width=600)

    st.subheader("Confusion matrix")
    st.plotly_chart(fig)

    return scores


# if 'df' in st.session_state:
#     task_type = st.sidebar.selectbox("Task Type", ["classification", "regression"],key="task_type")
#     model_type = st.sidebar.selectbox("Model Type", ["linear", "xgboost"])
#     flag = st.sidebar.selectbox("Flag Column", [None] + list(st.session_state.df.columns))
#     identifier = st.sidebar.selectbox("Identifier Column", [None] + list(st.session_state.df.columns))
#     st.sidebar.write("Applicable only for Regression model type")
#     dep_var = st.sidebar.selectbox("Dependent Variable (Regression)",
#         [None] + list(st.session_state.df.columns))
#     st.session_state.flag=flag
#     st.session_state.identifier=identifier
#     # Sidebar for user inputs
#     if flag is not None:
#         with st.expander("Model Configuration", expanded=True):
#             unique_flag_values = st.session_state.df[flag].unique()
#             for value in unique_flag_values:
#                 st.write(f"Y == {value}: {len(st.session_state.df[st.session_state.df[flag] == value])}")
#             control_sample_size = st.text_input("Control Sample Size")
#             try:
#                 # Try converting to an integer
#                 control_sample_size = int(control_sample_size)
#                 # Check if control_sample_size is within the valid range
#                 flag_0_size = len(st.session_state.df[st.session_state.df[flag] == 0])
#                 if control_sample_size < 0 or control_sample_size > flag_0_size:
#                     st.error(f"Control Sample Size must be between 0 and {flag_0_size}.")
#             except ValueError:
#                 st.error("Please enter a valid integer for Control Sample Size.")
#             #st.write("Applicable only for Regression model type")
#             #if st.session_state.get("task_type","") == "regression":
#             #dep_var = st.sidebar.selectbox("Dependent Variable (Regression)", [None] + list(st.session_state.df.columns))
#             point_estimate_variable = st.text_input("Variable of interest")
#             st.session_state.point_estimate_variable=point_estimate_variable
#             if st.button("Run Modeling"):
#                 result_df = point_estimates(st.session_state.df, task_type, model_type, point_estimate_variable, control_sample_size, flag, identifier, dep_var)
#                 st.session_state.modeling_df = result_df
#                 st.session_state.treated_df=result_df[result_df['Y']==1]
#                 st.session_state.non_treated_df=result_df[result_df['Y']==0]


def _algorithm_toggle(label, checkbox_key, options_key):
    """Render an algorithm checkbox next to its 'Change default options' toggle.

    Returns a ``(selected, show_options)`` pair of booleans.
    """
    left, right = st.columns(2)
    with left:
        st.write("#####")
        selected = st.checkbox(
            label=label,
            key=checkbox_key,
            value=(st.session_state.algorithm_option == label),
        )
    with right:
        st.write("#####")
        show_options = st.checkbox(
            label="Change default options",
            key=options_key,
            disabled=not selected,
        )
    return selected, show_options


def _lr_parameters():
    """Render the logistic-regression options and return (solver, max_iter, class_weights)."""
    with st.expander("LR parameters"):
        solver = st.selectbox('Solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag'])
        max_iter = st.slider('Max Iterations', min_value=100, max_value=10000, value=1000)
        class_weight_option = st.selectbox(
            'Select class weights option:',
            ('Custom', 'Balanced')
        )
        if class_weight_option == 'Custom':
            weight_1 = st.number_input('Weight for class 1', min_value=0.0, max_value=1.0, value=0.4, step=0.1)
            weight_0 = st.number_input('Weight for class 0', min_value=0.0, max_value=1.0, value=0.6, step=0.1)
            class_weights = {1: weight_1, 0: weight_0}
        else:
            # Equal 0.5/0.5 weights do not balance anything; scikit-learn's
            # 'balanced' mode weights classes inversely to their frequency.
            class_weights = 'balanced'
    return solver, max_iter, class_weights


def _xgb_parameters():
    """Render the XGBoost options and return (max_depth, subsample, eta)."""
    with st.expander("XGB hyper parameters"):
        max_depth = st.slider("max_depth", min_value=1, max_value=10, value=3, step=1)
        subsample = st.slider("subsample", min_value=0.1, max_value=1.0, value=0.8, step=0.1)
        eta = st.slider("learning rate", min_value=0.01, max_value=0.5, value=0.3, step=0.01)
    return max_depth, subsample, eta


def _missing_state(*keys):
    """Return the session-state keys among ``keys`` that are absent or None."""
    return [key for key in keys if st.session_state.get(key) is None]


st.title("Algorithms")

# Seed the defaults once per session; afterwards the widgets own these keys.
if 'classification_option' not in st.session_state:
    st.session_state.classification_option = "Classification"
if 'algorithm_option' not in st.session_state:
    st.session_state.algorithm_option = "Logistic Regression"

# The radio is bound to the session key, so no manual sync is needed.
classification_option = st.radio("Algorithm Type", ["Classification", "Regression"], key="classification_option")

# NOTE(review): both modes drive the same models in point_estimates
# (LogisticRegression / XGBClassifier); only the labels differ.
if classification_option == "Classification":
    lr_label, xgb_label = "Logistic Regression", "Xgboost Classifier"
else:
    lr_label, xgb_label = "Linear Regression", "Xgboost Regression"

# --- Linear model section -------------------------------------------------
lr_checkbox, show_lr_options = _algorithm_toggle(lr_label, "algorithm_lr_cb", "lr_options_cb")
lr_column = st.columns((2, 1))[0]
with lr_column:
    lr_hyp_placeholder = st.empty()

solver, max_iter, class_weights = 'lbfgs', 1000, None
if show_lr_options and lr_checkbox:
    with lr_hyp_placeholder:
        solver, max_iter, class_weights = _lr_parameters()

# --- XGBoost section --------------------------------------------------------
xgb_checkbox, show_xgb_options = _algorithm_toggle(xgb_label, "algorithm_xgb_cb", "xgb_options_cb")
xgb_column = st.columns((2, 1))[0]
with xgb_column:
    xgb_hyp_placeholder = st.empty()

max_depth, subsample, eta = None, None, None
if show_xgb_options and xgb_checkbox:
    with xgb_hyp_placeholder:
        max_depth, subsample, eta = _xgb_parameters()

st.session_state.algorithm_option = lr_label if lr_checkbox else xgb_label

# --- Control sample size ----------------------------------------------------
missing = _missing_state("imputed_df", "flag")
if missing:
    st.warning(f"Missing required session data: {', '.join(missing)}. Complete the earlier steps first.")
    st.stop()

flag_column = st.session_state.imputed_df[st.session_state.flag]
n_control = int((flag_column == 0).sum())
n_treated = int((flag_column == 1).sum())
if n_control < 1:
    st.error("No control rows (flag == 0) are available to sample from.")
    st.stop()

with xgb_column:
    control_sample_size = st.slider(
        'Control Sample Size',
        min_value=1,
        max_value=n_control,
        # Default to the treated count, clamped into the slider's range so
        # an imbalanced dataset cannot produce an out-of-bounds default.
        value=min(max(n_treated, 1), n_control),
    )

# --- Run ----------------------------------------------------------------------
if st.button("Run Modeling"):
    missing = _missing_state("binned_df", "identifier")
    if missing:
        st.warning(f"Missing required session data: {', '.join(missing)}. Complete the earlier steps first.")
        st.stop()

    if lr_checkbox:
        model_kwargs = dict(model_type='linear', solver=solver, max_iter=max_iter, class_weights=class_weights)
    elif xgb_checkbox:
        model_kwargs = dict(model_type='xgboost', max_depth=max_depth, subsample=subsample, eta=eta)
    else:
        model_kwargs = None
        st.warning("Select an algorithm before running the model.")

    if model_kwargs is not None:
        # Drop scores from a previous run so they are not fed back as a feature.
        model_input = st.session_state.binned_df.drop(columns=['propensity_score'], errors='ignore')
        scores = point_estimates(
            model_input,
            flag=st.session_state.flag,
            identifier=st.session_state.identifier,
            control_sample_size=control_sample_size,
            **model_kwargs,
        )
        # Scores come back one per input row in input order; assign by
        # position so a non-default index on binned_df cannot misalign them.
        st.session_state.binned_df['propensity_score'] = scores['propensity_score'].to_numpy()

    # Split on the selected flag column rather than a hard-coded 'Y'.
    binned = st.session_state.binned_df
    st.session_state.treated_df = binned[binned[st.session_state.flag] == 1]
    st.session_state.non_treated_df = binned[binned[st.session_state.flag] == 0]