File size: 14,098 Bytes
6c88ced
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a9febf
 
 
 
 
 
 
 
 
 
6c88ced
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import streamlit as st
import plotly.express as px
import plotly.figure_factory as ff

# Regression models offered in the UI, in display order.
_MODEL_OPTIONS = ["Linear Regression", "Random Forest Regression", "Lasso Regression"]

# Strategies offered for handling missing values, in display order.
_MISSING_STRATEGIES = [
    "Drop Missing Values",
    "Fill with Mean",
    "Fill with Median",
    "Fill with Mode",
    "Do Nothing",
]

# Search space handed to GridSearchCV for each model.
_PARAM_GRIDS = {
    "Linear Regression": {'fit_intercept': [True, False]},
    "Random Forest Regression": {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
    },
    "Lasso Regression": {'alpha': [0.01, 0.1, 1, 10], 'max_iter': [1000, 5000, 10000]},
}


def _load_data(uploaded_file):
    """Read the uploaded CSV, or ``example.csv`` when nothing was uploaded.

    The source is read exactly once: a second ``pd.read_csv`` on the same
    upload buffer would start at end-of-file and fail.

    Returns the DataFrame, or ``None`` (after showing an error in the UI)
    when no usable data could be loaded.
    """
    source = 'example.csv' if uploaded_file is None else uploaded_file
    try:
        data = pd.read_csv(source)
    except FileNotFoundError:
        st.error("Example CSV file not found. Please upload your own CSV file.")
        return None
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        st.error(f"Could not read the CSV file: {exc}")
        return None
    if uploaded_file is None:
        st.info("Loaded example.csv")
    return data


def _apply_missing_strategy(data, strategy):
    """Return ``(cleaned_data, message)`` for the chosen missing-value strategy.

    The input frame is never modified in place; a new frame is returned.
    """
    if strategy == "Drop Missing Values":
        return data.dropna(), "Dropped rows with missing values."
    if strategy == "Fill with Mean":
        # numeric_only avoids a TypeError on frames that contain text columns.
        return data.fillna(data.mean(numeric_only=True)), "Filled missing values with the mean."
    if strategy == "Fill with Median":
        return data.fillna(data.median(numeric_only=True)), "Filled missing values with the median."
    if strategy == "Fill with Mode":
        filled = data.copy()
        for column in filled.select_dtypes(include=['object']).columns:
            modes = filled[column].mode()
            # An all-missing column has no mode; leave it untouched.
            if not modes.empty:
                filled[column] = filled[column].fillna(modes.iloc[0])
        return filled, "Filled missing values with the mode for categorical columns."
    return data, "No changes made to missing values."


def _make_model(model_name):
    """Build a fresh, unfitted estimator for the selected model name."""
    if model_name == "Linear Regression":
        return LinearRegression()
    if model_name == "Random Forest Regression":
        return RandomForestRegressor(random_state=42)
    return Lasso()


def _fit_and_score(estimator, X, y):
    """Fit ``estimator`` on an 80/20 split and return a dict of test results.

    ``estimator`` may be a plain regressor or a ``GridSearchCV``; in the
    latter case ``predict`` delegates to the refitted best estimator.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    return {
        "y_test": y_test.reset_index(drop=True),
        "y_pred": y_pred,
        "mse": mean_squared_error(y_test, y_pred),
        "r2": r2_score(y_test, y_pred),
    }


def _serialize_model(model):
    """Return the joblib-serialized bytes of ``model`` without touching disk."""
    import io

    buffer = io.BytesIO()
    joblib.dump(model, buffer)
    return buffer.getvalue()


def _show_results(result):
    """Render the true-vs-predicted line plot and metrics stored in ``result``."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(result["y_test"], label="True Values", color="blue", linestyle="--")
    ax.plot(result["y_pred"], label="Predicted Values", color="orange")
    ax.set_title(result["title"])
    ax.set_xlabel('Index')
    ax.set_ylabel('Values')
    ax.legend()
    st.pyplot(fig)
    plt.close(fig)  # release the figure; the script reruns on every interaction

    if "best_params" in result:
        st.success(f"Best Parameters: {result['best_params']}")
    st.success(f"Mean Squared Error: {result['mse']:.2f}")
    st.success(f"R^2 Score: {result['r2']:.2f}")


# Main function
def main() -> None:
    """Render the seven-step Streamlit regression workflow.

    Steps: load a CSV, preview it, clean it, explore it, pick the target and
    features, train a baseline regression model, and tune hyperparameters.

    Values that must outlive a single button click are kept in
    ``st.session_state``: the applied missing-value strategy, the baseline
    result (``model_result``) and the tuned result (``tuned_result``).
    The trained model is serialized in memory for download rather than
    written to a user-named file on disk.
    """
    st.set_page_config(page_title="Data Automation-Machine Learning")
    st.title("Machine Learning")

    with st.expander("1: Add Your Data Source"):
        uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
        # If no file is uploaded, example.csv is loaded instead.
        data = _load_data(uploaded_file)

    # Without data none of the later steps can run.
    if data is None:
        return

    with st.expander("2: DataSet Preview"):
        # Step 2: Data Overview
        view1, view2, view3, view4 = st.columns(4)
        with view1:
            st.write("Data Overview")
            st.dataframe(data.head())
        with view2:
            st.write(" Data Description")
            st.write(data.describe())
        with view3:
            st.write(" Missing Values")
            st.write(data.isnull().sum())
        with view4:
            st.write(" Data Types")
            st.write(data.dtypes)

    with st.expander("3: Data Cleaning"):
        # Step 3: Data Cleaning
        clean1, clean2, clean3 = st.columns(3)
        with clean1:
            st.write(" Data Summary Before Cleaning")
            st.write(data.describe())
        with clean2:
            st.write("Missing Values Before Cleaning:")
            st.write(data.isnull().sum())
        with clean3:
            # Visualize missing values
            if st.checkbox("Show Missing Values Heatmap"):
                fig, ax = plt.subplots(figsize=(10, 6))
                sns.heatmap(data.isnull(), cbar=False, cmap='viridis', ax=ax)
                ax.set_title("Missing Values Heatmap")
                st.pyplot(fig)
                plt.close(fig)

        clean4, clean5 = st.columns(2)
        with clean4:
            # Remove duplicates
            if st.checkbox("Remove Duplicate Rows"):
                rows_before = data.shape[0]
                data = data.drop_duplicates()
                st.success(f"Removed {rows_before - data.shape[0]} duplicate rows.")

        with clean5:
            # Handle missing values
            missing_strategy = st.selectbox(
                "Choose a strategy for handling missing values",
                options=_MISSING_STRATEGIES,
            )

            # A button is True only on the rerun triggered by its click, so the
            # choice is remembered and re-applied on every later rerun.
            if st.button("Apply Missing Value Strategy"):
                st.session_state["applied_missing_strategy"] = missing_strategy

            applied_strategy = st.session_state.get("applied_missing_strategy")
            if applied_strategy is not None:
                data, message = _apply_missing_strategy(data, applied_strategy)
                if applied_strategy == "Do Nothing":
                    st.info(message)
                else:
                    st.success(message)

        after1, after2 = st.columns(2)
        with after1:
            # Display basic info after cleaning
            st.write(" Data Summary After Cleaning")
            st.write(data.describe())
        with after2:
            st.write("Missing Values After Cleaning:")
            st.write(data.isnull().sum())

    with st.expander('4: EDA'):
        # Step 4: Exploratory Data Analysis (EDA)
        st.write("Correlation Matrix")

        # Correlation is only defined for numeric columns.
        numeric_data = data.select_dtypes(include="number")
        if numeric_data.shape[1] == 0:
            st.info("No numeric columns available for correlation analysis.")
        else:
            # Rounded so the heatmap annotations stay readable.
            correlation_matrix = numeric_data.corr().round(2)

            # Create a heatmap using Plotly
            fig = ff.create_annotated_heatmap(
                z=correlation_matrix.values,
                x=list(correlation_matrix.columns),
                y=list(correlation_matrix.index),
            )

            # Update layout for better readability
            fig.update_layout(
                title="Correlation Matrix",
                xaxis_title="Features",
                yaxis_title="Features",
                width=700,  # Adjust width as needed
                height=500,  # Adjust height as needed
            )

            # Display the figure in Streamlit
            st.plotly_chart(fig)

        eda1, eda2 = st.columns(2)
        with eda1:
            # Plotting distributions for numerical features
            if st.checkbox("Show Distribution Plots for Numeric Features"):
                for column in numeric_data.columns:
                    fig, ax = plt.subplots(figsize=(8, 4))
                    sns.histplot(data[column], bins=30, kde=True, ax=ax)
                    ax.set_title(f'Distribution of {column}')
                    st.pyplot(fig)
                    plt.close(fig)
        with eda2:
            # Boxplots for outlier detection
            if st.checkbox("Show Boxplots for Numeric Features"):
                for column in numeric_data.columns:
                    fig, ax = plt.subplots(figsize=(8, 4))
                    sns.boxplot(x=data[column], ax=ax)
                    ax.set_title(f'Boxplot of {column}')
                    st.pyplot(fig)
                    plt.close(fig)

    with st.expander("5: Feature Engineering"):
        target_column = st.selectbox("Select the target variable", options=data.columns)
        feature_columns = st.multiselect("Select features", options=data.columns.drop(target_column))

    with st.expander("6: Modelling "):
        # Model training
        model_option = st.selectbox("Select Regression Model", options=_MODEL_OPTIONS)
        # Rendered outside the button branch so the typed name survives reruns.
        model_name = st.text_input('Enter model name', 'my_model')

        if st.button("Train Model (Without Hyperparameter Tuning)"):
            if not feature_columns:
                st.warning("Select at least one feature in step 5 before training.")
            else:
                model = _make_model(model_option)
                try:
                    result = _fit_and_score(model, data[feature_columns], data[target_column])
                except (ValueError, TypeError) as exc:
                    # e.g. missing values or non-numeric columns in the selection
                    st.error(f"Model training failed: {exc}")
                else:
                    # The title is fixed at training time so it keeps naming
                    # the model that actually produced these results.
                    result["title"] = f'{model_option}: True Values vs Predictions'
                    result["model_bytes"] = _serialize_model(model)
                    st.session_state["model_result"] = result
                    st.success("Model trained successfully!")

        # Display the stored baseline results, if any.
        baseline_result = st.session_state.get("model_result")
        if baseline_result is not None:
            download_name = model_name.strip() or 'my_model'
            st.download_button(
                label="Download Model",
                data=baseline_result["model_bytes"],
                file_name=f'{download_name}.pkl',
                mime="application/octet-stream",
            )
            _show_results(baseline_result)

    with st.expander("7: HyperParameter"):
        # Step 8: Hyperparameter Tuning
        st.write("Hyperparameter Tuning")
        if not feature_columns:
            st.info("Select at least one feature in step 5 to enable hyperparameter tuning.")
        else:
            hyperparam_model_option = st.selectbox(
                "Select Model for Hyperparameter Tuning",
                options=_MODEL_OPTIONS,
            )

            if st.button("Train Model with Hyperparameter Tuning"):
                grid_search = GridSearchCV(
                    _make_model(hyperparam_model_option),
                    _PARAM_GRIDS[hyperparam_model_option],
                    cv=5,
                )
                try:
                    with st.spinner("Running grid search..."):
                        result = _fit_and_score(grid_search, data[feature_columns], data[target_column])
                except (ValueError, TypeError) as exc:
                    # e.g. too few rows for 5-fold CV, or unusable column values
                    st.error(f"Hyperparameter tuning failed: {exc}")
                else:
                    result["title"] = f'{hyperparam_model_option}: True Values vs Predictions (Tuned)'
                    result["best_params"] = grid_search.best_params_
                    # Kept separate from the baseline so neither overwrites the other.
                    st.session_state["tuned_result"] = result

            # Display the stored tuned results, if any.
            tuned_result = st.session_state.get("tuned_result")
            if tuned_result is not None:
                _show_results(tuned_result)



# Run the app only when this file is executed directly rather than imported.
if __name__ == "__main__":
    main()