Surbhi committed
Commit df0e756 · 1 Parent(s): cedd211

Feature extraction and model training

Files changed (1)
  1. app.py +70 -24
app.py CHANGED
@@ -8,7 +8,7 @@ from sklearn.preprocessing import StandardScaler, LabelEncoder
 from sklearn.feature_selection import SelectKBest, f_classif
 from sklearn.impute import SimpleImputer
 from imblearn.over_sampling import SMOTE
-from sklearn.metrics import accuracy_score, classification_report
+from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, mean_absolute_error, r2_score
 
 # Import ML Models
 from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
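Note: the widened metrics import is what enables the regression evaluation branch added further down. A minimal sketch of the three new metrics on made-up arrays (values are illustrative only):

    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

    y_true = [3.0, 2.5, 4.0, 7.1]
    y_hat = [2.8, 2.9, 4.2, 6.5]

    print(mean_squared_error(y_true, y_hat))   # mean of squared errors, lower is better
    print(mean_absolute_error(y_true, y_hat))  # mean of absolute errors, same units as the target
    print(r2_score(y_true, y_hat))             # 1.0 is a perfect fit; can go negative for poor fits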
@@ -49,14 +49,20 @@ problems = {
 
 problem = st.sidebar.selectbox("Choose a Problem:", problems[task][model])
 
-# Dataset Selection (User selects a pre-existing fake dataset)
-dataset_mapping = {
-    "Spam Detection": "datasets/spam_detection.csv",
-    "Disease Prediction": "datasets/disease_prediction.csv",
-    "Fraud Detection": "datasets/fraud_detection.csv",
-    "House Price Prediction": "datasets/house_price.csv",
-    "Sales Forecasting": "datasets/sales_forecasting.csv",
-}
+dataset_mapping = {name: f"datasets/{name.lower().replace(' ', '_')}.csv" for sublist in problems.values() for model in sublist for name in sublist[model]}
+
+# # Dataset Selection (User selects a pre-existing fake dataset)
+# dataset_mapping = {
+#     "Spam Detection": "datasets/spam_detection.csv",
+#     "Disease Prediction": "datasets/disease_prediction.csv",
+#     "Image Recognition": "datasets/image_recognition.csv",
+#     "Text Classification": "datasets/text_classification.csv",
+#     "Fraud Detection": "datasets/fraud_detection.csv",
+#     "Customer Segmentation": "datasets/customer_segmentation.csv",
+#     "Loan Approval": "datasets/loan_approval.csv",
+#     "House Price Prediction": "datasets/house_price_prediction.csv",
+#     "Sales Forecasting": "datasets/sales_forecasting.csv",
+# }
 
 dataset_path = dataset_mapping.get(problem, "datasets/spam_detection.csv")
 df = pd.read_csv(dataset_path)
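Note: the new dict comprehension derives each CSV path from the problem name itself, replacing the hard-coded mapping. A small sketch of what it expands to, assuming a `problems` structure shaped like the one the app indexes with `problems[task][model]` (the dict below is a made-up excerpt):

    problems = {
        "Classification": {"KNN": ["Spam Detection", "Disease Prediction"]},
        "Regression": {"KNN": ["House Price Prediction", "Sales Forecasting"]},
    }

    dataset_mapping = {
        name: f"datasets/{name.lower().replace(' ', '_')}.csv"
        for sublist in problems.values()
        for model in sublist
        for name in sublist[model]
    }

    print(dataset_mapping["Spam Detection"])          # datasets/spam_detection.csv
    print(dataset_mapping["House Price Prediction"])  # datasets/house_price_prediction.csv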
@@ -96,7 +102,7 @@ X_train = scaler.fit_transform(X_train)
 X_test = scaler.transform(X_test)
 
 # Feature Selection
-selector = SelectKBest(score_func=f_classif, k=5)
+selector = SelectKBest(score_func=f_classif, k=min(5, X.shape[1]))  # Ensure k does not exceed available features
 X_train = selector.fit_transform(X_train, y_train)
 X_test = selector.transform(X_test)
 
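Note: capping k avoids the ValueError SelectKBest raises when k exceeds the number of available columns. A minimal sketch on an assumed 3-feature frame, including one way to recover which columns were kept:

    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import SelectKBest, f_classif

    X = pd.DataFrame(np.random.rand(20, 3), columns=["a", "b", "c"])  # only 3 features
    y = np.array([0, 1] * 10)

    selector = SelectKBest(score_func=f_classif, k=min(5, X.shape[1]))  # k becomes 3 here, not 5
    X_selected = selector.fit_transform(X, y)
    print(X.columns[selector.get_support()])  # names of the retained columns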
@@ -106,13 +112,24 @@ if task == "Classification":
     X_train, y_train = smote.fit_resample(X_train, y_train)
 
 # Model Initialization
-model_mapping = {
-    "KNN": KNeighborsClassifier(n_neighbors=min(5, len(X_train))) if task == "Classification"
-    else KNeighborsRegressor(n_neighbors=min(5, len(X_train))), "SVM": SVC() if task == "Classification" else SVR(),
-    "Random Forest": RandomForestClassifier() if task == "Classification" else RandomForestRegressor(),
-    "Decision Tree": DecisionTreeClassifier() if task == "Classification" else DecisionTreeRegressor(),
-    "Perceptron": Perceptron() if task == "Classification" else Perceptron()
-}
+if task == "Classification":
+    n_neighbors = min(5, len(y_train))  # Ensure k is valid
+    model_mapping = {
+        "KNN": KNeighborsClassifier(n_neighbors=n_neighbors),
+        "SVM": SVC(),
+        "Random Forest": RandomForestClassifier(),
+        "Decision Tree": DecisionTreeClassifier(),
+        "Perceptron": Perceptron()
+    }
+else:
+    n_neighbors = min(5, len(y_train))  # Ensure k is valid
+    model_mapping = {
+        "KNN": KNeighborsRegressor(n_neighbors=n_neighbors),
+        "SVM": SVR(),
+        "Random Forest": RandomForestRegressor(),
+        "Decision Tree": DecisionTreeRegressor(),
+        "Perceptron": Perceptron()
+    }
 
 model_instance = model_mapping[model]
 
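Note: splitting the mapping by task keeps each branch readable, and `min(5, len(y_train))` keeps KNN usable on very small training sets, since n_neighbors may not exceed the number of fitted samples. A minimal sketch of that guard on made-up data:

    import numpy as np
    from sklearn.neighbors import KNeighborsClassifier

    X_train = np.random.rand(3, 4)      # deliberately tiny: 3 samples, 4 features
    y_train = np.array([0, 1, 0])

    n_neighbors = min(5, len(y_train))  # 3, so fitting and predicting stay valid
    model_instance = KNeighborsClassifier(n_neighbors=n_neighbors)
    model_instance.fit(X_train, y_train)
    print(model_instance.predict(X_train[:1]))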
@@ -120,22 +137,51 @@ model_instance = model_mapping[model]
 model_instance.fit(X_train, y_train)
 y_pred = model_instance.predict(X_test)
 
-# Evaluation Metrics
+# Model Evaluation
 st.subheader("📊 Model Evaluation")
+
 if task == "Classification":
     accuracy = accuracy_score(y_test, y_pred)
-    report = classification_report(y_test, y_pred)
-    st.write(f"**Accuracy:** {accuracy:.2f}")
-    st.text(report)
-else:
-    st.write("Regression evaluation metrics will be added soon!")
+    report = classification_report(y_test, y_pred, output_dict=True)
 
-# Visualization
+    st.write(f"**Accuracy:** {accuracy:.2f}")
+    st.json(report)  # Shows detailed structured metrics
+
+elif task == "Regression":
+    mse = mean_squared_error(y_test, y_pred)
+    mae = mean_absolute_error(y_test, y_pred)
+    r2 = r2_score(y_test, y_pred)
+
+    st.write(f"**Mean Squared Error (MSE):** {mse:.4f}")
+    st.write(f"**Mean Absolute Error (MAE):** {mae:.4f}")
+    st.write(f"**R² Score:** {r2:.4f}")
+
+# Data Visualization
 st.subheader("📈 Data Visualization")
+
+# Heatmap
+st.write("### 🔥 Feature Correlation")
 plt.figure(figsize=(8, 5))
 sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
 st.pyplot(plt)
 
+# Pair Plot
+st.write("### 📊 Pair Plot of Features")
+sns.pairplot(df, diag_kind='kde')
+st.pyplot()
+
+# Feature Importance (for tree-based models)
+if model in ["Random Forest", "Decision Tree"]:
+    feature_importances = model_instance.feature_importances_
+    feature_names = X.columns
+    importance_df = pd.DataFrame({"Feature": feature_names, "Importance": feature_importances})
+    importance_df = importance_df.sort_values(by="Importance", ascending=False)
+
+    st.write("### 🌟 Feature Importance")
+    fig, ax = plt.subplots()
+    sns.barplot(x=importance_df["Importance"], y=importance_df["Feature"], ax=ax)
+    st.pyplot(fig)
+
 # Download Code
 st.download_button("🐍 Download Python Code (.py)", "ai_model.py")
 st.download_button("📓 Download Notebook (.ipynb)", "ai_model.ipynb")
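Note: with output_dict=True, classification_report returns nested dictionaries (per-label precision/recall/F1/support plus accuracy and the macro/weighted averages), which is what st.json renders as structured output. A small sketch of that shape on made-up labels:

    from sklearn.metrics import classification_report

    y_test = [0, 1, 1, 0, 1]
    y_pred = [0, 1, 0, 0, 1]

    report = classification_report(y_test, y_pred, output_dict=True)
    print(report["1"]["precision"])         # per-label metrics live under the label key
    print(report["accuracy"])               # overall accuracy as a plain float
    print(report["macro avg"]["f1-score"])  # unweighted mean across labels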
 