destiratnakomala committed on
Commit
da983ca
·
verified ·
1 Parent(s): 77e4ac2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +172 -268
app.py CHANGED
@@ -7,7 +7,6 @@ from sklearn.linear_model import LinearRegression, Lasso
7
  from sklearn.ensemble import RandomForestRegressor
8
  from sklearn.metrics import mean_squared_error, r2_score
9
  import joblib
10
- import streamlit as st
11
  import plotly.express as px
12
  import plotly.figure_factory as ff
13
 
@@ -18,7 +17,7 @@ def main():
18
 
19
  with st.expander("1: Add Your Data Source"):
20
  uploaded_file = st.file_uploader("Upload your CSV or Excel file", type=["csv", "xlsx", "xls"])
21
-
22
  if uploaded_file is None:
23
  try:
24
  data = pd.read_csv('example.csv') # Load example CSV
@@ -46,284 +45,189 @@ def main():
46
  except Exception as e:
47
  st.error(f"An error occurred: {e}")
48
 
49
-
50
  with st.expander("2: DataSet Preview"):
51
  if uploaded_file is not None:
52
- data = pd.read_csv(uploaded_file)
53
- else:
54
- data = pd.read_csv('example.csv')
55
- # Step 2: Data Overview
56
- view1, view2,view3, view4 = st.columns(4)
57
- with view1:
58
- st.write("Data Overview")
59
- st.dataframe(data.head())
60
- with view2:
61
- st.write(" Data Description")
62
- st.write(data.describe())
63
- with view3:
64
- st.write(" Missing Values")
65
- st.write(data.isnull().sum())
66
- with view4:
67
- st.write(" Data Types")
68
- st.write(data.dtypes)
69
-
70
 
71
  with st.expander("3: Data Cleaning"):
72
- # Step 3: Data Cleaning
73
- clean1, clean2, clean3 = st.columns(3)
74
- with clean1:
75
- st.write(" Data Summary Before Cleaning")
76
- st.write(data.describe())
77
- with clean2:
78
- st.write("Missing Values Before Cleaning:")
79
- st.write(data.isnull().sum())
80
- with clean3:
81
- # Visualize missing values
82
- if st.checkbox("Show Missing Values Heatmap"):
83
- fig, ax = plt.subplots(figsize=(10, 6))
84
- sns.heatmap(data.isnull(), cbar=False, cmap='viridis', ax=ax)
85
- plt.title("Missing Values Heatmap")
86
- st.pyplot(fig)
87
-
88
- clean4, clean5= st.columns(2)
89
- with clean4:
90
- # Remove duplicates
91
- if st.checkbox("Remove Duplicate Rows"):
92
- initial_shape = data.shape
93
- data = data.drop_duplicates()
94
- st.success(f"Removed {initial_shape[0] - data.shape[0]} duplicate rows.")
95
-
96
-
97
- with clean5:
98
- # Handle missing values
99
- missing_strategy = st.selectbox(
100
- "Choose a strategy for handling missing values",
101
- options=["Drop Missing Values", "Fill with Mean", "Fill with Median", "Fill with Mode", "Do Nothing"]
102
- )
103
-
104
- if st.button("Apply Missing Value Strategy"):
105
- if missing_strategy == "Drop Missing Values":
106
- data.dropna(inplace=True)
107
- st.success("Dropped rows with missing values.")
108
- elif missing_strategy == "Fill with Mean":
109
- data.fillna(data.mean(), inplace=True)
110
- st.success("Filled missing values with the mean.")
111
- elif missing_strategy == "Fill with Median":
112
- data.fillna(data.median(), inplace=True)
113
- st.success("Filled missing values with the median.")
114
- elif missing_strategy == "Fill with Mode":
115
- for column in data.select_dtypes(include=['object']).columns:
116
- data[column].fillna(data[column].mode()[0], inplace=True)
117
- st.success("Filled missing values with the mode for categorical columns.")
118
- elif missing_strategy == "Do Nothing":
119
- st.info("No changes made to missing values.")
120
- clean7, clean8= st.columns(2)
121
- with clean7:
122
- # Display basic info after cleaning
123
- st.write(" Data Summary After Cleaning")
124
- st.write(data.describe())
125
- with clean8:
126
- st.write("Missing Values After Cleaning:")
127
- st.write(data.isnull().sum())
128
-
129
  with st.expander('4: EDA'):
130
-
131
- # Step 4: Exploratory Data Analysis (EDA)
132
- st.write("Correlation Matrix")
133
-
134
- # Calculate the correlation matrix
135
- correlation_matrix = data.corr()
136
-
137
- # Create a heatmap using Plotly
138
- fig = ff.create_annotated_heatmap(
139
- z=correlation_matrix.values,
140
- x=list(correlation_matrix.columns),
141
- y=list(correlation_matrix.index),
142
- )
143
-
144
- # Update layout for better readability
145
- fig.update_layout(
146
- title="Correlation Matrix",
147
- xaxis_title="Features",
148
- yaxis_title="Features",
149
- width=700, # Adjust width as needed
150
- height=500, # Adjust height as needed
151
- )
152
-
153
- # Display the figure in Streamlit
154
- st.plotly_chart(fig)
155
- eda1, eda2= st.columns(2)
156
- with eda1:
157
- # Plotting distributions for numerical features
158
- if st.checkbox("Show Distribution Plots for Numeric Features"):
159
- for column in data.select_dtypes(include=[int, float]).columns:
160
- fig, ax = plt.subplots(figsize=(8, 4))
161
- sns.histplot(data[column], bins=30, kde=True, ax=ax)
162
- plt.title(f'Distribution of {column}')
163
- st.pyplot(fig)
164
- with eda2:
165
- # Boxplots for outlier detection
166
- if st.checkbox("Show Boxplots for Numeric Features"):
167
- for column in data.select_dtypes(include=[int, float]).columns:
168
- fig, ax = plt.subplots(figsize=(8, 4))
169
- sns.boxplot(x=data[column], ax=ax)
170
- plt.title(f'Boxplot of {column}')
171
- st.pyplot(fig)
172
-
173
- with st.expander("5: Feature Engineering"):
174
- target_column = st.selectbox("Select the target variable", options=data.columns)
175
- feature_columns = st.multiselect("Select features", options=data.columns.drop(target_column))
176
- with st.expander("6: Modelling "):
177
- # Initialize session state for storing results
178
- if 'model_plot' not in st.session_state:
179
- st.session_state.model_plot = None
180
- if 'model_metrics' not in st.session_state:
181
- st.session_state.model_metrics = None
182
-
183
- # Model training
184
- model_option = st.selectbox("Select Regression Model", options=["Linear Regression", "Random Forest Regression", "Lasso Regression"])
185
-
186
- if st.button("Train Model (Without Hyperparameter Tuning)"):
187
- if feature_columns:
188
- X = data[feature_columns]
189
- y = data[target_column]
190
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
191
-
192
- # Initialize the selected model
193
- if model_option == "Linear Regression":
194
- model = LinearRegression()
195
- elif model_option == "Random Forest Regression":
196
- model = RandomForestRegressor(random_state=42)
197
- elif model_option == "Lasso Regression":
198
- model = Lasso()
199
-
200
- # Train model
201
- model.fit(X_train, y_train)
202
-
203
- # Save the model
204
- model_name = st.text_input('Enter model name', 'my_model')
205
- model_file_path = f'{model_name}.pkl'
206
- joblib.dump(model, model_file_path)
207
- st.success("Model saved successfully!")
208
-
209
- # Add a download button for the model
210
- with open(model_file_path, "rb") as f:
211
- st.download_button(
212
- label="Download Model",
213
- data=f,
214
- file_name=model_file_path,
215
- mime="application/octet-stream"
216
- )
217
-
218
- # Make predictions
219
- y_pred = model.predict(X_test)
220
-
221
- # Calculate metrics
222
- mse = mean_squared_error(y_test, y_pred)
223
- r2 = r2_score(y_test, y_pred)
224
-
225
- # Step 7: Visualization of Predictions (Line Plot)
226
- st.session_state.model_plot = (y_test.reset_index(drop=True), y_pred)
227
- st.session_state.model_metrics = (mse, r2)
228
-
229
- # Show results
230
- st.success(f"Mean Squared Error: {mse:.2f}")
231
- st.success(f"R^2 Score: {r2:.2f}")
232
-
233
-
234
-
235
-
236
- # Display model plot if available
237
- if st.session_state.model_plot is not None:
238
- y_test, y_pred = st.session_state.model_plot
239
- fig, ax = plt.subplots(figsize=(10, 6))
240
- ax.plot(y_test, label="True Values", color="blue", linestyle="--")
241
- ax.plot(y_pred, label="Predicted Values", color="orange")
242
- ax.set_title(f'{model_option}: True Values vs Predictions')
243
- ax.set_xlabel('Index')
244
- ax.set_ylabel('Values')
245
- ax.legend()
246
  st.pyplot(fig)
247
 
248
- # Display metrics if available
249
- if st.session_state.model_metrics is not None:
250
- mse, r2 = st.session_state.model_metrics
251
- st.success(f"Mean Squared Error: {mse:.2f}")
252
- st.success(f"R^2 Score: {r2:.2f}")
253
 
 
 
 
 
 
254
 
255
- with st.expander("7: HyperParameter"):
256
- # Step 8: Hyperparameter Tuning
257
- st.write("Hyperparameter Tuning")
258
  if feature_columns:
259
- hyperparam_model_option = st.selectbox("Select Model for Hyperparameter Tuning", options=["Linear Regression", "Random Forest Regression", "Lasso Regression"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
  if hyperparam_model_option == "Linear Regression":
262
- param_grid = {'fit_intercept': [True, False]}
 
263
  elif hyperparam_model_option == "Random Forest Regression":
264
- param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5, 10]}
 
265
  elif hyperparam_model_option == "Lasso Regression":
266
- param_grid = {'alpha': [0.01, 0.1, 1, 10], 'max_iter': [1000, 5000, 10000]}
267
-
268
- if st.button("Train Model with Hyperparameter Tuning"):
269
- # Prepare data for training
270
- X = data[feature_columns]
271
- y = data[target_column]
272
-
273
- # Split data into training and testing sets
274
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
275
-
276
- # Initialize and perform hyperparameter tuning
277
- if hyperparam_model_option == "Linear Regression":
278
- model = LinearRegression()
279
- grid_search = GridSearchCV(model, param_grid, cv=5)
280
- elif hyperparam_model_option == "Random Forest Regression":
281
- model = RandomForestRegressor(random_state=42)
282
- grid_search = GridSearchCV(model, param_grid, cv=5)
283
- elif hyperparam_model_option == "Lasso Regression":
284
- model = Lasso()
285
- grid_search = GridSearchCV(model, param_grid, cv=5)
286
-
287
- # Train the model
288
- grid_search.fit(X_train, y_train)
289
-
290
- # Make predictions
291
- best_model = grid_search.best_estimator_
292
- y_pred = best_model.predict(X_test)
293
-
294
- # Calculate metrics
295
- mse = mean_squared_error(y_test, y_pred)
296
- r2 = r2_score(y_test, y_pred)
297
-
298
- # Step 9: Visualization of Predictions (Line Plot)
299
- st.session_state.model_plot = (y_test.reset_index(drop=True), y_pred)
300
- st.session_state.model_metrics = (mse, r2)
301
-
302
- # Show results
303
- st.success(f"Best Parameters: {grid_search.best_params_}")
304
- st.success(f"Mean Squared Error: {mse:.2f}")
305
- st.success(f"R^2 Score: {r2:.2f}")
306
-
307
- # Display hyperparameter tuned model plot if available
308
- if st.session_state.model_plot is not None:
309
- y_test, y_pred = st.session_state.model_plot
310
- fig, ax = plt.subplots(figsize=(10, 6))
311
- ax.plot(y_test, label="True Values", color="blue", linestyle="--")
312
- ax.plot(y_pred, label="Predicted Values", color="orange")
313
- ax.set_title(f'{hyperparam_model_option}: True Values vs Predictions (Tuned)')
314
- ax.set_xlabel('Index')
315
- ax.set_ylabel('Values')
316
- ax.legend()
317
- st.pyplot(fig)
318
-
319
- # Display metrics if available
320
- if st.session_state.model_metrics is not None:
321
- mse, r2 = st.session_state.model_metrics
322
- st.success(f"Mean Squared Error: {mse:.2f}")
323
- st.success(f"R^2 Score: {r2:.2f}")
324
-
325
-
326
-
327
- # Run the app
328
- if __name__ == "__main__":
329
  main()
 
7
  from sklearn.ensemble import RandomForestRegressor
8
  from sklearn.metrics import mean_squared_error, r2_score
9
  import joblib
 
10
  import plotly.express as px
11
  import plotly.figure_factory as ff
12
 
 
17
 
18
  with st.expander("1: Add Your Data Source"):
19
  uploaded_file = st.file_uploader("Upload your CSV or Excel file", type=["csv", "xlsx", "xls"])
20
+
21
  if uploaded_file is None:
22
  try:
23
  data = pd.read_csv('example.csv') # Load example CSV
 
45
  except Exception as e:
46
  st.error(f"An error occurred: {e}")
47
 
 
48
  with st.expander("2: DataSet Preview"):
49
  if uploaded_file is not None:
50
+ st.write("Data Overview")
51
+ st.dataframe(data.head())
52
+ st.write("Data Description")
53
+ st.write(data.describe())
54
+ st.write("Missing Values")
55
+ st.write(data.isnull().sum())
56
+ st.write("Data Types")
57
+ st.write(data.dtypes)
 
 
 
 
 
 
 
 
 
 
58
 
59
  with st.expander("3: Data Cleaning"):
60
+ st.write("Data Summary Before Cleaning")
61
+ st.write(data.describe())
62
+ st.write("Missing Values Before Cleaning:")
63
+ st.write(data.isnull().sum())
64
+
65
+ if st.checkbox("Show Missing Values Heatmap"):
66
+ fig, ax = plt.subplots(figsize=(10, 6))
67
+ sns.heatmap(data.isnull(), cbar=False, cmap='viridis', ax=ax)
68
+ plt.title("Missing Values Heatmap")
69
+ st.pyplot(fig)
70
+
71
+ if st.checkbox("Remove Duplicate Rows"):
72
+ initial_shape = data.shape
73
+ data = data.drop_duplicates()
74
+ st.success(f"Removed {initial_shape[0] - data.shape[0]} duplicate rows.")
75
+
76
+ missing_strategy = st.selectbox(
77
+ "Choose a strategy for handling missing values",
78
+ options=["Drop Missing Values", "Fill with Mean", "Fill with Median", "Fill with Mode", "Do Nothing"]
79
+ )
80
+
81
+ if st.button("Apply Missing Value Strategy"):
82
+ if missing_strategy == "Drop Missing Values":
83
+ data.dropna(inplace=True)
84
+ st.success("Dropped rows with missing values.")
85
+ elif missing_strategy == "Fill with Mean":
86
+ data.fillna(data.mean(), inplace=True)
87
+ st.success("Filled missing values with the mean.")
88
+ elif missing_strategy == "Fill with Median":
89
+ data.fillna(data.median(), inplace=True)
90
+ st.success("Filled missing values with the median.")
91
+ elif missing_strategy == "Fill with Mode":
92
+ for column in data.select_dtypes(include=['object']).columns:
93
+ data[column].fillna(data[column].mode()[0], inplace=True)
94
+ st.success("Filled missing values with the mode for categorical columns.")
95
+ elif missing_strategy == "Do Nothing":
96
+ st.info("No changes made to missing values.")
97
+
98
+ st.write("Data Summary After Cleaning")
99
+ st.write(data.describe())
100
+ st.write("Missing Values After Cleaning:")
101
+ st.write(data.isnull().sum())
102
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  with st.expander('4: EDA'):
104
+ st.write("Correlation Matrix")
105
+ correlation_matrix = data.corr()
106
+ fig = ff.create_annotated_heatmap(
107
+ z=correlation_matrix.values,
108
+ x=list(correlation_matrix.columns),
109
+ y=list(correlation_matrix.index),
110
+ )
111
+ fig.update_layout(
112
+ title="Correlation Matrix",
113
+ xaxis_title="Features",
114
+ yaxis_title="Features",
115
+ width=700,
116
+ height=500,
117
+ )
118
+ st.plotly_chart(fig)
119
+
120
+ if st.checkbox("Show Distribution Plots for Numeric Features"):
121
+ for column in data.select_dtypes(include=[int, float]).columns:
122
+ fig, ax = plt.subplots(figsize=(8, 4))
123
+ sns.histplot(data[column], bins=30, kde=True, ax=ax)
124
+ plt.title(f'Distribution of {column}')
125
+ st.pyplot(fig)
126
+
127
+ if st.checkbox("Show Boxplots for Numeric Features"):
128
+ for column in data.select_dtypes(include=[int, float]).columns:
129
+ fig, ax = plt.subplots(figsize=(8, 4))
130
+ sns.boxplot(x=data[column], ax=ax)
131
+ plt.title(f'Boxplot of {column}')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  st.pyplot(fig)
133
 
134
+ with st.expander("5: Feature Engineering"):
135
+ target_column = st.selectbox("Select the target variable", options=data.columns)
136
+ feature_columns = st.multiselect("Select features", options=data.columns.drop(target_column))
 
 
137
 
138
+ with st.expander("6: Modelling"):
139
+ if 'model_plot' not in st.session_state:
140
+ st.session_state.model_plot = None
141
+ if 'model_metrics' not in st.session_state:
142
+ st.session_state.model_metrics = None
143
 
144
+ model_option = st.selectbox("Select Regression Model", options=["Linear Regression", "Random Forest Regression", "Lasso Regression"])
145
+
146
+ if st.button("Train Model (Without Hyperparameter Tuning)"):
147
  if feature_columns:
148
+ X = data[feature_columns]
149
+ y = data[target_column]
150
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
151
+
152
+ if model_option == "Linear Regression":
153
+ model = LinearRegression()
154
+ elif model_option == "Random Forest Regression":
155
+ model = RandomForestRegressor(random_state=42)
156
+ elif model_option == "Lasso Regression":
157
+ model = Lasso()
158
+
159
+ model.fit(X_train, y_train)
160
+
161
+ model_name = st.text_input('Enter model name', 'my_model')
162
+ model_file_path = f'{model_name}.pkl'
163
+ joblib.dump(model, model_file_path)
164
+ st.success("Model saved successfully!")
165
+
166
+ with open(model_file_path, "rb") as f:
167
+ st.download_button(
168
+ label="Download Model",
169
+ data=f,
170
+ file_name=model_file_path,
171
+ mime="application/octet-stream"
172
+ )
173
+
174
+ y_pred = model.predict(X_test)
175
+ mse = mean_squared_error(y_test, y_pred)
176
+ r2 = r2_score(y_test, y_pred)
177
+
178
+ st.session_state.model_plot = (y_test.reset_index(drop=True), y_pred)
179
+ st.session_state.model_metrics = (mse, r2)
180
+
181
+ st.success(f"Mean Squared Error: {mse:.2f}")
182
+ st.success(f"R^2 Score: {r2:.2f}")
183
+
184
+ if st.session_state.model_plot is not None:
185
+ y_test, y_pred = st.session_state.model_plot
186
+ fig, ax = plt.subplots(figsize=(10, 6))
187
+ ax.plot(y_test, label="True Values", color="blue", linestyle="--")
188
+ ax.plot(y_pred, label="Predicted Values", color="orange")
189
+ ax.set_title(f'{model_option}: True Values vs Predictions')
190
+ ax.set_xlabel('Index')
191
+ ax.set_ylabel('Values')
192
+ ax.legend()
193
+ st.pyplot(fig)
194
+
195
+ if st.session_state.model_metrics is not None:
196
+ mse, r2 = st.session_state.model_metrics
197
+ st.success(f"Mean Squared Error: {mse:.2f}")
198
+ st.success(f"R^2 Score: {r2:.2f}")
199
+
200
+ with st.expander("7: HyperParameter"):
201
+ if feature_columns:
202
+ hyperparam_model_option = st.selectbox("Select Model for Hyperparameter Tuning", options=["Linear Regression", "Random Forest Regression", "Lasso Regression"])
203
+
204
+ if hyperparam_model_option == "Linear Regression":
205
+ param_grid = {'fit_intercept': [True, False]}
206
+ elif hyperparam_model_option == "Random Forest Regression":
207
+ param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5, 10]}
208
+ elif hyperparam_model_option == "Lasso Regression":
209
+ param_grid = {'alpha': [0.01, 0.1, 1, 10], 'max_iter': [1000, 5000, 10000]}
210
+
211
+ if st.button("Train Model with Hyperparameter Tuning"):
212
+ X = data[feature_columns]
213
+ y = data[target_column]
214
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
215
 
216
  if hyperparam_model_option == "Linear Regression":
217
+ model = LinearRegression()
218
+ grid_search = GridSearchCV(model, param_grid, cv=5)
219
  elif hyperparam_model_option == "Random Forest Regression":
220
+ model = RandomForestRegressor(random_state=42)
221
+ grid_search = GridSearchCV(model, param_grid, cv=5)
222
  elif hyperparam_model_option == "Lasso Regression":
223
+ model = Lasso()
224
+ grid_search = GridSearchCV(model, param_grid, cv=5)
225
+
226
+ grid_search.fit(X_train, y_train)
227
+ best_params = grid_search.best_params_
228
+
229
+ st.success(f"Best Hyperparameters: {best_params}")
230
+
231
+ # Run the application
232
+ if __name__ == '__main__':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  main()