destiratnakomala committed on
Commit
8ff1bfe
·
verified ·
1 Parent(s): 5143653

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -275
app.py CHANGED
@@ -7,7 +7,6 @@ from sklearn.linear_model import LinearRegression, Lasso
7
  from sklearn.ensemble import RandomForestRegressor
8
  from sklearn.metrics import mean_squared_error, r2_score
9
  import joblib
10
- import streamlit as st
11
  import plotly.express as px
12
  import plotly.figure_factory as ff
13
 
@@ -17,295 +16,221 @@ def main():
17
  st.title("Machine Learning")
18
 
19
  with st.expander("1: Add Your Data Source"):
20
- uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
21
-
22
- # If no file is uploaded, load example.csv
23
- if uploaded_file is None:
24
- try:
25
- data = pd.read_csv('example.csv') # Load example CSV
26
- st.info("Loaded example.csv")
27
- except FileNotFoundError:
28
- st.error("Example CSV file not found. Please upload your own CSV file.")
29
- else:
30
- data = pd.read_csv(uploaded_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  with st.expander("2: DataSet Preview"):
33
  if uploaded_file is not None:
34
  data = pd.read_csv(uploaded_file)
35
  else:
36
- data= pd.read_csv('example.csv')
37
- # Step 2: Data Overview
38
- view1, view2,view3, view4 = st.columns(4)
39
- with view1:
40
- st.write("Data Overview")
41
- st.dataframe(data.head())
42
- with view2:
43
- st.write(" Data Description")
44
- st.write(data.describe())
45
- with view3:
46
- st.write(" Missing Values")
47
- st.write(data.isnull().sum())
48
- with view4:
49
- st.write(" Data Types")
50
- st.write(data.dtypes)
51
-
52
 
53
  with st.expander("3: Data Cleaning"):
54
- # Step 3: Data Cleaning
55
- clean1, clean2, clean3 = st.columns(3)
56
- with clean1:
57
- st.write(" Data Summary Before Cleaning")
58
- st.write(data.describe())
59
- with clean2:
60
- st.write("Missing Values Before Cleaning:")
61
- st.write(data.isnull().sum())
62
- with clean3:
63
- # Visualize missing values
64
- if st.checkbox("Show Missing Values Heatmap"):
65
- fig, ax = plt.subplots(figsize=(10, 6))
66
- sns.heatmap(data.isnull(), cbar=False, cmap='viridis', ax=ax)
67
- plt.title("Missing Values Heatmap")
68
- st.pyplot(fig)
69
-
70
- clean4, clean5= st.columns(2)
71
- with clean4:
72
- # Remove duplicates
73
- if st.checkbox("Remove Duplicate Rows"):
74
- initial_shape = data.shape
75
- data = data.drop_duplicates()
76
- st.success(f"Removed {initial_shape[0] - data.shape[0]} duplicate rows.")
77
-
78
-
79
- with clean5:
80
- # Handle missing values
81
- missing_strategy = st.selectbox(
82
- "Choose a strategy for handling missing values",
83
- options=["Drop Missing Values", "Fill with Mean", "Fill with Median", "Fill with Mode", "Do Nothing"]
84
- )
85
-
86
- if st.button("Apply Missing Value Strategy"):
87
- if missing_strategy == "Drop Missing Values":
88
- data.dropna(inplace=True)
89
- st.success("Dropped rows with missing values.")
90
- elif missing_strategy == "Fill with Mean":
91
- data.fillna(data.mean(), inplace=True)
92
- st.success("Filled missing values with the mean.")
93
- elif missing_strategy == "Fill with Median":
94
- data.fillna(data.median(), inplace=True)
95
- st.success("Filled missing values with the median.")
96
- elif missing_strategy == "Fill with Mode":
97
- for column in data.select_dtypes(include=['object']).columns:
98
- data[column].fillna(data[column].mode()[0], inplace=True)
99
- st.success("Filled missing values with the mode for categorical columns.")
100
- elif missing_strategy == "Do Nothing":
101
- st.info("No changes made to missing values.")
102
- clean7, clean8= st.columns(2)
103
- with clean7:
104
- # Display basic info after cleaning
105
- st.write(" Data Summary After Cleaning")
106
- st.write(data.describe())
107
- with clean8:
108
- st.write("Missing Values After Cleaning:")
109
- st.write(data.isnull().sum())
110
-
111
  with st.expander('4: EDA'):
112
-
113
- # Step 4: Exploratory Data Analysis (EDA)
114
- st.write("Correlation Matrix")
115
-
116
- # Calculate the correlation matrix
117
- correlation_matrix = data.corr()
118
-
119
- # Create a heatmap using Plotly
120
- fig = ff.create_annotated_heatmap(
121
- z=correlation_matrix.values,
122
- x=list(correlation_matrix.columns),
123
- y=list(correlation_matrix.index),
124
- )
125
-
126
- # Update layout for better readability
127
- fig.update_layout(
128
- title="Correlation Matrix",
129
- xaxis_title="Features",
130
- yaxis_title="Features",
131
- width=700, # Adjust width as needed
132
- height=500, # Adjust height as needed
133
- )
134
-
135
- # Display the figure in Streamlit
136
- st.plotly_chart(fig)
137
- eda1, eda2= st.columns(2)
138
- with eda1:
139
- # Plotting distributions for numerical features
140
- if st.checkbox("Show Distribution Plots for Numeric Features"):
141
- for column in data.select_dtypes(include=[int, float]).columns:
142
- fig, ax = plt.subplots(figsize=(8, 4))
143
- sns.histplot(data[column], bins=30, kde=True, ax=ax)
144
- plt.title(f'Distribution of {column}')
145
- st.pyplot(fig)
146
- with eda2:
147
- # Boxplots for outlier detection
148
- if st.checkbox("Show Boxplots for Numeric Features"):
149
- for column in data.select_dtypes(include=[int, float]).columns:
150
- fig, ax = plt.subplots(figsize=(8, 4))
151
- sns.boxplot(x=data[column], ax=ax)
152
- plt.title(f'Boxplot of {column}')
153
- st.pyplot(fig)
154
-
155
- with st.expander("5: Feature Engineering"):
156
- target_column = st.selectbox("Select the target variable", options=data.columns)
157
- feature_columns = st.multiselect("Select features", options=data.columns.drop(target_column))
158
- with st.expander("6: Modelling "):
159
- # Initialize session state for storing results
160
- if 'model_plot' not in st.session_state:
161
- st.session_state.model_plot = None
162
- if 'model_metrics' not in st.session_state:
163
- st.session_state.model_metrics = None
164
-
165
- # Model training
166
- model_option = st.selectbox("Select Regression Model", options=["Linear Regression", "Random Forest Regression", "Lasso Regression"])
167
-
168
- if st.button("Train Model (Without Hyperparameter Tuning)"):
169
- if feature_columns:
170
- X = data[feature_columns]
171
- y = data[target_column]
172
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
173
-
174
- # Initialize the selected model
175
- if model_option == "Linear Regression":
176
- model = LinearRegression()
177
- elif model_option == "Random Forest Regression":
178
- model = RandomForestRegressor(random_state=42)
179
- elif model_option == "Lasso Regression":
180
- model = Lasso()
181
-
182
- # Train model
183
- model.fit(X_train, y_train)
184
-
185
- # Save the model
186
- model_name = st.text_input('Enter model name', 'my_model')
187
- model_file_path = f'{model_name}.pkl'
188
- joblib.dump(model, model_file_path)
189
- st.success("Model saved successfully!")
190
-
191
- # Add a download button for the model
192
- with open(model_file_path, "rb") as f:
193
- st.download_button(
194
- label="Download Model",
195
- data=f,
196
- file_name=model_file_path,
197
- mime="application/octet-stream"
198
- )
199
-
200
- # Make predictions
201
- y_pred = model.predict(X_test)
202
-
203
- # Calculate metrics
204
- mse = mean_squared_error(y_test, y_pred)
205
- r2 = r2_score(y_test, y_pred)
206
-
207
- # Step 7: Visualization of Predictions (Line Plot)
208
- st.session_state.model_plot = (y_test.reset_index(drop=True), y_pred)
209
- st.session_state.model_metrics = (mse, r2)
210
-
211
- # Show results
212
- st.success(f"Mean Squared Error: {mse:.2f}")
213
- st.success(f"R^2 Score: {r2:.2f}")
214
-
215
-
216
-
217
-
218
- # Display model plot if available
219
- if st.session_state.model_plot is not None:
220
- y_test, y_pred = st.session_state.model_plot
221
- fig, ax = plt.subplots(figsize=(10, 6))
222
- ax.plot(y_test, label="True Values", color="blue", linestyle="--")
223
- ax.plot(y_pred, label="Predicted Values", color="orange")
224
- ax.set_title(f'{model_option}: True Values vs Predictions')
225
- ax.set_xlabel('Index')
226
- ax.set_ylabel('Values')
227
- ax.legend()
228
  st.pyplot(fig)
229
 
230
- # Display metrics if available
231
- if st.session_state.model_metrics is not None:
232
- mse, r2 = st.session_state.model_metrics
233
- st.success(f"Mean Squared Error: {mse:.2f}")
234
- st.success(f"R^2 Score: {r2:.2f}")
 
235
 
 
 
 
236
 
237
- with st.expander("7: HyperParameter"):
238
- # Step 8: Hyperparameter Tuning
239
- st.write("Hyperparameter Tuning")
 
 
 
 
 
 
240
  if feature_columns:
241
- hyperparam_model_option = st.selectbox("Select Model for Hyperparameter Tuning", options=["Linear Regression", "Random Forest Regression", "Lasso Regression"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
  if hyperparam_model_option == "Linear Regression":
244
- param_grid = {'fit_intercept': [True, False]}
 
245
  elif hyperparam_model_option == "Random Forest Regression":
246
- param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5, 10]}
 
247
  elif hyperparam_model_option == "Lasso Regression":
248
- param_grid = {'alpha': [0.01, 0.1, 1, 10], 'max_iter': [1000, 5000, 10000]}
249
-
250
- if st.button("Train Model with Hyperparameter Tuning"):
251
- # Prepare data for training
252
- X = data[feature_columns]
253
- y = data[target_column]
254
-
255
- # Split data into training and testing sets
256
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
257
-
258
- # Initialize and perform hyperparameter tuning
259
- if hyperparam_model_option == "Linear Regression":
260
- model = LinearRegression()
261
- grid_search = GridSearchCV(model, param_grid, cv=5)
262
- elif hyperparam_model_option == "Random Forest Regression":
263
- model = RandomForestRegressor(random_state=42)
264
- grid_search = GridSearchCV(model, param_grid, cv=5)
265
- elif hyperparam_model_option == "Lasso Regression":
266
- model = Lasso()
267
- grid_search = GridSearchCV(model, param_grid, cv=5)
268
-
269
- # Train the model
270
- grid_search.fit(X_train, y_train)
271
-
272
- # Make predictions
273
- best_model = grid_search.best_estimator_
274
- y_pred = best_model.predict(X_test)
275
-
276
- # Calculate metrics
277
- mse = mean_squared_error(y_test, y_pred)
278
- r2 = r2_score(y_test, y_pred)
279
-
280
- # Step 9: Visualization of Predictions (Line Plot)
281
- st.session_state.model_plot = (y_test.reset_index(drop=True), y_pred)
282
- st.session_state.model_metrics = (mse, r2)
283
-
284
- # Show results
285
- st.success(f"Best Parameters: {grid_search.best_params_}")
286
- st.success(f"Mean Squared Error: {mse:.2f}")
287
- st.success(f"R^2 Score: {r2:.2f}")
288
-
289
- # Display hyperparameter tuned model plot if available
290
- if st.session_state.model_plot is not None:
291
- y_test, y_pred = st.session_state.model_plot
292
- fig, ax = plt.subplots(figsize=(10, 6))
293
- ax.plot(y_test, label="True Values", color="blue", linestyle="--")
294
- ax.plot(y_pred, label="Predicted Values", color="orange")
295
- ax.set_title(f'{hyperparam_model_option}: True Values vs Predictions (Tuned)')
296
- ax.set_xlabel('Index')
297
- ax.set_ylabel('Values')
298
- ax.legend()
299
- st.pyplot(fig)
300
-
301
- # Display metrics if available
302
- if st.session_state.model_metrics is not None:
303
- mse, r2 = st.session_state.model_metrics
304
- st.success(f"Mean Squared Error: {mse:.2f}")
305
- st.success(f"R^2 Score: {r2:.2f}")
306
-
307
-
308
-
309
- # Run the app
310
- if __name__ == "__main__":
311
  main()
 
7
  from sklearn.ensemble import RandomForestRegressor
8
  from sklearn.metrics import mean_squared_error, r2_score
9
  import joblib
 
10
  import plotly.express as px
11
  import plotly.figure_factory as ff
12
 
 
16
  st.title("Machine Learning")
17
 
18
  with st.expander("1: Add Your Data Source"):
19
+ uploaded_file = st.file_uploader("Upload your CSV or Excel file", type=["csv", "xlsx", "xls"])
20
+
21
+ if uploaded_file is None:
22
+ try:
23
+ data = pd.read_csv('example.csv') # Load example CSV
24
+ st.info("Loaded example.csv")
25
+ except FileNotFoundError:
26
+ st.error("Example CSV file not found. Please upload your own CSV or Excel file.")
27
+ except pd.errors.EmptyDataError:
28
+ st.error("Example CSV file is empty or invalid.")
29
+ else:
30
+ try:
31
+ if uploaded_file.name.endswith('.csv'):
32
+ data = pd.read_csv(uploaded_file)
33
+ elif uploaded_file.name.endswith(('.xlsx', '.xls')):
34
+ data = pd.read_excel(uploaded_file)
35
+
36
+ # Check if the file has content
37
+ if data.empty:
38
+ st.error("Uploaded file is empty. Please upload a valid CSV or Excel file.")
39
+ else:
40
+ st.success("File uploaded successfully!")
41
+ except pd.errors.EmptyDataError:
42
+ st.error("The uploaded file is empty or contains no readable data.")
43
+ except ValueError:
44
+ st.error("Error in file format. Please ensure the file is a valid CSV or Excel.")
45
+ except Exception as e:
46
+ st.error(f"An error occurred: {e}")
47
 
48
  with st.expander("2: DataSet Preview"):
49
  if uploaded_file is not None:
50
  data = pd.read_csv(uploaded_file)
51
  else:
52
+ data = pd.read_csv('example.csv')
53
+ st.write("Data Overview")
54
+ st.dataframe(data.head())
55
+ st.write("Data Description")
56
+ st.write(data.describe())
57
+ st.write("Missing Values")
58
+ st.write(data.isnull().sum())
59
+ st.write("Data Types")
60
+ st.write(data.dtypes)
 
 
 
 
 
 
 
61
 
62
  with st.expander("3: Data Cleaning"):
63
+ st.write("Data Summary Before Cleaning")
64
+ st.write(data.describe())
65
+ st.write("Missing Values Before Cleaning:")
66
+ st.write(data.isnull().sum())
67
+
68
+ if st.checkbox("Show Missing Values Heatmap"):
69
+ fig, ax = plt.subplots(figsize=(10, 6))
70
+ sns.heatmap(data.isnull(), cbar=False, cmap='viridis', ax=ax)
71
+ plt.title("Missing Values Heatmap")
72
+ st.pyplot(fig)
73
+
74
+ if st.checkbox("Remove Duplicate Rows"):
75
+ initial_shape = data.shape
76
+ data = data.drop_duplicates()
77
+ st.success(f"Removed {initial_shape[0] - data.shape[0]} duplicate rows.")
78
+
79
+ missing_strategy = st.selectbox(
80
+ "Choose a strategy for handling missing values",
81
+ options=["Drop Missing Values", "Fill with Mean", "Fill with Median", "Fill with Mode", "Do Nothing"]
82
+ )
83
+
84
+ if st.button("Apply Missing Value Strategy"):
85
+ if missing_strategy == "Drop Missing Values":
86
+ data.dropna(inplace=True)
87
+ st.success("Dropped rows with missing values.")
88
+ elif missing_strategy == "Fill with Mean":
89
+ data.fillna(data.mean(), inplace=True)
90
+ st.success("Filled missing values with the mean.")
91
+ elif missing_strategy == "Fill with Median":
92
+ data.fillna(data.median(), inplace=True)
93
+ st.success("Filled missing values with the median.")
94
+ elif missing_strategy == "Fill with Mode":
95
+ for column in data.select_dtypes(include=['object']).columns:
96
+ data[column].fillna(data[column].mode()[0], inplace=True)
97
+ st.success("Filled missing values with the mode for categorical columns.")
98
+ elif missing_strategy == "Do Nothing":
99
+ st.info("No changes made to missing values.")
100
+
101
+ st.write("Data Summary After Cleaning")
102
+ st.write(data.describe())
103
+ st.write("Missing Values After Cleaning:")
104
+ st.write(data.isnull().sum())
105
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  with st.expander('4: EDA'):
107
+ st.write("Correlation Matrix")
108
+ correlation_matrix = data.corr()
109
+ fig = ff.create_annotated_heatmap(
110
+ z=correlation_matrix.values,
111
+ x=list(correlation_matrix.columns),
112
+ y=list(correlation_matrix.index),
113
+ )
114
+ fig.update_layout(
115
+ title="Correlation Matrix",
116
+ xaxis_title="Features",
117
+ yaxis_title="Features",
118
+ width=700,
119
+ height=500,
120
+ )
121
+ st.plotly_chart(fig)
122
+
123
+ if st.checkbox("Show Distribution Plots for Numeric Features"):
124
+ for column in data.select_dtypes(include=[int, float]).columns:
125
+ fig, ax = plt.subplots(figsize=(8, 4))
126
+ sns.histplot(data[column], bins=30, kde=True, ax=ax)
127
+ plt.title(f'Distribution of {column}')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  st.pyplot(fig)
129
 
130
+ if st.checkbox("Show Boxplots for Numeric Features"):
131
+ for column in data.select_dtypes(include=[int, float]).columns:
132
+ fig, ax = plt.subplots(figsize=(8, 4))
133
+ sns.boxplot(x=data[column], ax=ax)
134
+ plt.title(f'Boxplot of {column}')
135
+ st.pyplot(fig)
136
 
137
+ with st.expander("5: Feature Engineering"):
138
+ target_column = st.selectbox("Select the target variable", options=data.columns)
139
+ feature_columns = st.multiselect("Select features", options=data.columns.drop(target_column))
140
 
141
+ with st.expander("6: Modelling"):
142
+ if 'model_plot' not in st.session_state:
143
+ st.session_state.model_plot = None
144
+ if 'model_metrics' not in st.session_state:
145
+ st.session_state.model_metrics = None
146
+
147
+ model_option = st.selectbox("Select Regression Model", options=["Linear Regression", "Random Forest Regression", "Lasso Regression"])
148
+
149
+ if st.button("Train Model (Without Hyperparameter Tuning)"):
150
  if feature_columns:
151
+ X = data[feature_columns]
152
+ y = data[target_column]
153
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
154
+
155
+ if model_option == "Linear Regression":
156
+ model = LinearRegression()
157
+ elif model_option == "Random Forest Regression":
158
+ model = RandomForestRegressor(random_state=42)
159
+ elif model_option == "Lasso Regression":
160
+ model = Lasso()
161
+
162
+ model.fit(X_train, y_train)
163
+
164
+ model_name = st.text_input('Enter model name', 'my_model')
165
+ model_file_path = f'{model_name}.pkl'
166
+ joblib.dump(model, model_file_path)
167
+ st.success("Model saved successfully!")
168
+
169
+ with open(model_file_path, "rb") as f:
170
+ st.download_button(
171
+ label="Download Model",
172
+ data=f,
173
+ file_name=model_file_path,
174
+ mime="application/octet-stream"
175
+ )
176
+
177
+ y_pred = model.predict(X_test)
178
+ mse = mean_squared_error(y_test, y_pred)
179
+ r2 = r2_score(y_test, y_pred)
180
+
181
+ st.session_state.model_plot = (y_test.reset_index(drop=True), y_pred)
182
+ st.session_state.model_metrics = (mse, r2)
183
+
184
+ st.success(f"Mean Squared Error: {mse:.2f}")
185
+ st.success(f"R^2 Score: {r2:.2f}")
186
+
187
+ if st.session_state.model_plot is not None:
188
+ y_test, y_pred = st.session_state.model_plot
189
+ fig, ax = plt.subplots(figsize=(10, 6))
190
+ ax.plot(y_test, label="True Values", color="blue", linestyle="--")
191
+ ax.plot(y_pred, label="Predicted Values", color="orange")
192
+ ax.set_title(f'{model_option}: True Values vs Predictions')
193
+ ax.set_xlabel('Index')
194
+ ax.set_ylabel('Values')
195
+ ax.legend()
196
+ st.pyplot(fig)
197
+
198
+ if st.session_state.model_metrics is not None:
199
+ mse, r2 = st.session_state.model_metrics
200
+ st.success(f"Mean Squared Error: {mse:.2f}")
201
+ st.success(f"R^2 Score: {r2:.2f}")
202
+
203
+ with st.expander("7: HyperParameter"):
204
+ if feature_columns:
205
+ hyperparam_model_option = st.selectbox("Select Model for Hyperparameter Tuning", options=["Linear Regression", "Random Forest Regression", "Lasso Regression"])
206
+
207
+ if hyperparam_model_option == "Linear Regression":
208
+ param_grid = {'fit_intercept': [True, False]}
209
+ elif hyperparam_model_option == "Random Forest Regression":
210
+ param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5, 10]}
211
+ elif hyperparam_model_option == "Lasso Regression":
212
+ param_grid = {'alpha': [0.01, 0.1, 1, 10], 'max_iter': [1000, 5000, 10000]}
213
+
214
+ if st.button("Train Model with Hyperparameter Tuning"):
215
+ X = data[feature_columns]
216
+ y = data[target_column]
217
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
218
 
219
  if hyperparam_model_option == "Linear Regression":
220
+ model = LinearRegression()
221
+ grid_search = GridSearchCV(model, param_grid, cv=5)
222
  elif hyperparam_model_option == "Random Forest Regression":
223
+ model = RandomForestRegressor(random_state=42)
224
+ grid_search = GridSearchCV(model, param_grid, cv=5)
225
  elif hyperparam_model_option == "Lasso Regression":
226
+ model = Lasso()
227
+ grid_search = GridSearchCV(model, param_grid, cv=5)
228
+
229
+ grid_search.fit(X_train, y_train)
230
+ best_params = grid_search.best_params_
231
+
232
+ st.success(f"Best Hyperparameters: {best_params}")
233
+
234
+ # Run the application
235
+ if __name__ == '__main__':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  main()