Surbhi committed on
Commit
a15c8e6
·
1 Parent(s): df0e756

Feature extraction and model training

Browse files
Files changed (1) hide show
  1. app.py +35 -73
app.py CHANGED
@@ -3,14 +3,13 @@ import pandas as pd
3
  import numpy as np
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
 
6
  from sklearn.model_selection import train_test_split
7
  from sklearn.preprocessing import StandardScaler, LabelEncoder
8
  from sklearn.feature_selection import SelectKBest, f_classif
9
  from sklearn.impute import SimpleImputer
10
- from imblearn.over_sampling import SMOTE
11
  from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, mean_absolute_error, r2_score
12
-
13
- # Import ML Models
14
  from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
15
  from sklearn.svm import SVC, SVR
16
  from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
@@ -49,20 +48,9 @@ problems = {
49
 
50
  problem = st.sidebar.selectbox("Choose a Problem:", problems[task][model])
51
 
52
- dataset_mapping = {name: f"datasets/{name.lower().replace(' ', '_')}.csv" for sublist in problems.values() for model in sublist for name in sublist[model]}
53
-
54
- # # Dataset Selection (User selects a pre-existing fake dataset)
55
- # dataset_mapping = {
56
- # "Spam Detection": "datasets/spam_detection.csv",
57
- # "Disease Prediction": "datasets/disease_prediction.csv",
58
- # "Image Recognition": "datasets/image_recognition.csv",
59
- # "Text Classification": "datasets/text_classification.csv",
60
- # "Fraud Detection": "datasets/fraud_detection.csv",
61
- # "Customer Segmentation": "datasets/customer_segmentation.csv",
62
- # "Loan Approval": "datasets/loan_approval.csv",
63
- # "House Price Prediction": "datasets/house_price_prediction.csv",
64
- # "Sales Forecasting": "datasets/sales_forecasting.csv",
65
- # }
66
 
67
  dataset_path = dataset_mapping.get(problem, "datasets/spam_detection.csv")
68
  df = pd.read_csv(dataset_path)
@@ -92,8 +80,8 @@ for col in df.select_dtypes(include=['object']).columns:
92
  df[col] = label_encoders[col].fit_transform(df[col])
93
 
94
  # Split Data
95
- X = df.iloc[:, :-1] # Features
96
- y = df.iloc[:, -1] # Target
97
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
98
 
99
  # Feature Scaling
@@ -102,35 +90,31 @@ X_train = scaler.fit_transform(X_train)
102
  X_test = scaler.transform(X_test)
103
 
104
  # Feature Selection
105
- selector = SelectKBest(score_func=f_classif, k=min(5, X.shape[1])) # Ensure k does not exceed available features
106
  X_train = selector.fit_transform(X_train, y_train)
107
  X_test = selector.transform(X_test)
108
 
109
  # Handle imbalanced data
110
  if task == "Classification":
111
- smote = SMOTE()
112
- X_train, y_train = smote.fit_resample(X_train, y_train)
113
 
114
- # Model Initialization
115
- if task == "Classification":
116
- n_neighbors = min(5, len(y_train)) # Ensure k is valid
117
- model_mapping = {
118
- "KNN": KNeighborsClassifier(n_neighbors=n_neighbors),
119
- "SVM": SVC(),
120
- "Random Forest": RandomForestClassifier(),
121
- "Decision Tree": DecisionTreeClassifier(),
122
- "Perceptron": Perceptron()
123
- }
124
- else:
125
- n_neighbors = min(5, len(y_train)) # Ensure k is valid
126
- model_mapping = {
127
- "KNN": KNeighborsRegressor(n_neighbors=n_neighbors),
128
- "SVM": SVR(),
129
- "Random Forest": RandomForestRegressor(),
130
- "Decision Tree": DecisionTreeRegressor(),
131
- "Perceptron": Perceptron()
132
- }
133
 
 
 
 
 
 
 
 
 
 
134
  model_instance = model_mapping[model]
135
 
136
  # Train Model
@@ -139,52 +123,30 @@ y_pred = model_instance.predict(X_test)
139
 
140
  # Model Evaluation
141
  st.subheader("📊 Model Evaluation")
142
-
143
  if task == "Classification":
144
  accuracy = accuracy_score(y_test, y_pred)
145
  report = classification_report(y_test, y_pred, output_dict=True)
146
-
147
  st.write(f"**Accuracy:** {accuracy:.2f}")
148
- st.json(report) # Shows detailed structured metrics
149
-
150
- elif task == "Regression":
151
  mse = mean_squared_error(y_test, y_pred)
152
  mae = mean_absolute_error(y_test, y_pred)
153
  r2 = r2_score(y_test, y_pred)
154
-
155
- st.write(f"**Mean Squared Error (MSE):** {mse:.4f}")
156
- st.write(f"**Mean Absolute Error (MAE):** {mae:.4f}")
157
- st.write(f"**R² Score:** {r2:.4f}")
158
 
159
  # Data Visualization
160
  st.subheader("📈 Data Visualization")
161
 
162
- # Heatmap
163
  st.write("### 🔥 Feature Correlation")
164
- plt.figure(figsize=(8, 5))
165
- sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
166
- st.pyplot(plt)
167
-
168
- # Pair Plot
169
- st.write("### 📊 Pair Plot of Features")
170
- sns.pairplot(df, diag_kind='kde')
171
- st.pyplot()
172
-
173
- # Feature Importance (for tree-based models)
174
- if model in ["Random Forest", "Decision Tree"]:
175
- feature_importances = model_instance.feature_importances_
176
- feature_names = X.columns
177
- importance_df = pd.DataFrame({"Feature": feature_names, "Importance": feature_importances})
178
- importance_df = importance_df.sort_values(by="Importance", ascending=False)
179
-
180
  st.write("### 🌟 Feature Importance")
181
  fig, ax = plt.subplots()
182
  sns.barplot(x=importance_df["Importance"], y=importance_df["Feature"], ax=ax)
183
  st.pyplot(fig)
184
 
185
- # Download Code
186
- st.download_button("🐍 Download Python Code (.py)", "ai_model.py")
187
- st.download_button("📓 Download Notebook (.ipynb)", "ai_model.ipynb")
188
- st.markdown("[🚀 Open in Colab](https://colab.research.google.com/)")
189
-
190
- st.success("Code generated! Download and do magic! ✨")
 
3
  import numpy as np
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
6
+ from collections import Counter
7
  from sklearn.model_selection import train_test_split
8
  from sklearn.preprocessing import StandardScaler, LabelEncoder
9
  from sklearn.feature_selection import SelectKBest, f_classif
10
  from sklearn.impute import SimpleImputer
11
+ from imblearn.over_sampling import SMOTE, RandomOverSampler
12
  from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, mean_absolute_error, r2_score
 
 
13
  from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
14
  from sklearn.svm import SVC, SVR
15
  from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 
48
 
49
  problem = st.sidebar.selectbox("Choose a Problem:", problems[task][model])
50
 
51
+ # Dataset Mapping (Dynamic)
52
+ dataset_mapping = {name: f"datasets/{name.lower().replace(' ', '_')}.csv"
53
+ for sublist in problems.values() for model in sublist for name in sublist[model]}
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  dataset_path = dataset_mapping.get(problem, "datasets/spam_detection.csv")
56
  df = pd.read_csv(dataset_path)
 
80
  df[col] = label_encoders[col].fit_transform(df[col])
81
 
82
  # Split Data
83
+ X = df.iloc[:, :-1]
84
+ y = df.iloc[:, -1]
85
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
86
 
87
  # Feature Scaling
 
90
  X_test = scaler.transform(X_test)
91
 
92
  # Feature Selection
93
+ selector = SelectKBest(score_func=f_classif, k=min(X.shape[1], 5))
94
  X_train = selector.fit_transform(X_train, y_train)
95
  X_test = selector.transform(X_test)
96
 
97
  # Handle imbalanced data
98
  if task == "Classification":
99
+ class_counts = Counter(y_train)
100
+ min_class_samples = min(class_counts.values())
101
 
102
+ if min_class_samples > 5:
103
+ smote = SMOTE()
104
+ X_train, y_train = smote.fit_resample(X_train, y_train)
105
+ else:
106
+ ros = RandomOverSampler()
107
+ X_train, y_train = ros.fit_resample(X_train, y_train)
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
+ # Model Initialization
110
+ n_neighbors = min(5, len(y_train))
111
+ model_mapping = {
112
+ "KNN": KNeighborsClassifier(n_neighbors=n_neighbors) if task == "Classification" else KNeighborsRegressor(n_neighbors=n_neighbors),
113
+ "SVM": SVC() if task == "Classification" else SVR(),
114
+ "Random Forest": RandomForestClassifier() if task == "Classification" else RandomForestRegressor(),
115
+ "Decision Tree": DecisionTreeClassifier() if task == "Classification" else DecisionTreeRegressor(),
116
+ "Perceptron": Perceptron()
117
+ }
118
  model_instance = model_mapping[model]
119
 
120
  # Train Model
 
123
 
124
  # Model Evaluation
125
  st.subheader("📊 Model Evaluation")
 
126
  if task == "Classification":
127
  accuracy = accuracy_score(y_test, y_pred)
128
  report = classification_report(y_test, y_pred, output_dict=True)
 
129
  st.write(f"**Accuracy:** {accuracy:.2f}")
130
+ st.json(report)
131
+ else:
 
132
  mse = mean_squared_error(y_test, y_pred)
133
  mae = mean_absolute_error(y_test, y_pred)
134
  r2 = r2_score(y_test, y_pred)
135
+ st.write(f"**MSE:** {mse:.4f}, **MAE:** {mae:.4f}, **R² Score:** {r2:.4f}")
 
 
 
136
 
137
  # Data Visualization
138
  st.subheader("📈 Data Visualization")
139
 
 
140
  st.write("### 🔥 Feature Correlation")
141
+ fig, ax = plt.subplots(figsize=(8, 5))
142
+ sns.heatmap(df.corr(), annot=True, cmap="coolwarm", ax=ax)
143
+ st.pyplot(fig)
144
+
145
+ if model in ["Random Forest", "Decision Tree"] and hasattr(model_instance, "feature_importances_"):
146
+ importance_df = pd.DataFrame({"Feature": X.columns, "Importance": model_instance.feature_importances_}).sort_values(by="Importance", ascending=False)
 
 
 
 
 
 
 
 
 
 
147
  st.write("### 🌟 Feature Importance")
148
  fig, ax = plt.subplots()
149
  sns.barplot(x=importance_df["Importance"], y=importance_df["Feature"], ax=ax)
150
  st.pyplot(fig)
151
 
152
+ st.success("Code generated! 🚀 Download & run!")