Surbhi committed on
Commit
f28fb28
·
1 Parent(s): a15c8e6

Feature extraction and model training

Browse files
Files changed (1) hide show
  1. app.py +74 -23
app.py CHANGED
@@ -3,13 +3,15 @@ import pandas as pd
3
  import numpy as np
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
6
- from collections import Counter
7
  from sklearn.model_selection import train_test_split
8
  from sklearn.preprocessing import StandardScaler, LabelEncoder
9
  from sklearn.feature_selection import SelectKBest, f_classif
10
  from sklearn.impute import SimpleImputer
11
- from imblearn.over_sampling import SMOTE, RandomOverSampler
12
  from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, mean_absolute_error, r2_score
 
 
13
  from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
14
  from sklearn.svm import SVC, SVR
15
  from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
@@ -48,9 +50,18 @@ problems = {
48
 
49
  problem = st.sidebar.selectbox("Choose a Problem:", problems[task][model])
50
 
51
- # Dataset Mapping (Dynamic)
52
- dataset_mapping = {name: f"datasets/{name.lower().replace(' ', '_')}.csv"
53
- for sublist in problems.values() for model in sublist for name in sublist[model]}
 
 
 
 
 
 
 
 
 
54
 
55
  dataset_path = dataset_mapping.get(problem, "datasets/spam_detection.csv")
56
  df = pd.read_csv(dataset_path)
@@ -80,8 +91,8 @@ for col in df.select_dtypes(include=['object']).columns:
80
  df[col] = label_encoders[col].fit_transform(df[col])
81
 
82
  # Split Data
83
- X = df.iloc[:, :-1]
84
- y = df.iloc[:, -1]
85
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
86
 
87
  # Feature Scaling
@@ -90,31 +101,25 @@ X_train = scaler.fit_transform(X_train)
90
  X_test = scaler.transform(X_test)
91
 
92
  # Feature Selection
93
- selector = SelectKBest(score_func=f_classif, k=min(X.shape[1], 5))
94
  X_train = selector.fit_transform(X_train, y_train)
95
  X_test = selector.transform(X_test)
96
 
97
- # Handle imbalanced data
98
  if task == "Classification":
99
- class_counts = Counter(y_train)
100
- min_class_samples = min(class_counts.values())
101
-
102
- if min_class_samples > 5:
103
  smote = SMOTE()
104
  X_train, y_train = smote.fit_resample(X_train, y_train)
105
- else:
106
- ros = RandomOverSampler()
107
- X_train, y_train = ros.fit_resample(X_train, y_train)
108
 
109
  # Model Initialization
110
- n_neighbors = min(5, len(y_train))
111
  model_mapping = {
112
- "KNN": KNeighborsClassifier(n_neighbors=n_neighbors) if task == "Classification" else KNeighborsRegressor(n_neighbors=n_neighbors),
113
  "SVM": SVC() if task == "Classification" else SVR(),
114
  "Random Forest": RandomForestClassifier() if task == "Classification" else RandomForestRegressor(),
115
  "Decision Tree": DecisionTreeClassifier() if task == "Classification" else DecisionTreeRegressor(),
116
  "Perceptron": Perceptron()
117
  }
 
118
  model_instance = model_mapping[model]
119
 
120
  # Train Model
@@ -123,30 +128,76 @@ y_pred = model_instance.predict(X_test)
123
 
124
  # Model Evaluation
125
  st.subheader("πŸ“Š Model Evaluation")
 
126
  if task == "Classification":
127
  accuracy = accuracy_score(y_test, y_pred)
128
  report = classification_report(y_test, y_pred, output_dict=True)
129
  st.write(f"**Accuracy:** {accuracy:.2f}")
130
  st.json(report)
131
- else:
 
132
  mse = mean_squared_error(y_test, y_pred)
133
  mae = mean_absolute_error(y_test, y_pred)
134
  r2 = r2_score(y_test, y_pred)
135
- st.write(f"**MSE:** {mse:.4f}, **MAE:** {mae:.4f}, **RΒ² Score:** {r2:.4f}")
 
 
136
 
137
  # Data Visualization
138
  st.subheader("πŸ“ˆ Data Visualization")
139
 
 
140
  st.write("### πŸ”₯ Feature Correlation")
141
  fig, ax = plt.subplots(figsize=(8, 5))
142
  sns.heatmap(df.corr(), annot=True, cmap="coolwarm", ax=ax)
143
  st.pyplot(fig)
144
 
145
- if model in ["Random Forest", "Decision Tree"] and hasattr(model_instance, "feature_importances_"):
146
- importance_df = pd.DataFrame({"Feature": X.columns, "Importance": model_instance.feature_importances_}).sort_values(by="Importance", ascending=False)
 
 
 
 
147
  st.write("### 🌟 Feature Importance")
148
  fig, ax = plt.subplots()
149
  sns.barplot(x=importance_df["Importance"], y=importance_df["Feature"], ax=ax)
150
  st.pyplot(fig)
151
 
152
- st.success("Code generated! πŸš€ Download & run!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import numpy as np
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
6
+ import json
7
  from sklearn.model_selection import train_test_split
8
  from sklearn.preprocessing import StandardScaler, LabelEncoder
9
  from sklearn.feature_selection import SelectKBest, f_classif
10
  from sklearn.impute import SimpleImputer
11
+ from imblearn.over_sampling import SMOTE
12
  from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, mean_absolute_error, r2_score
13
+
14
+ # Import ML Models
15
  from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
16
  from sklearn.svm import SVC, SVR
17
  from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 
50
 
51
  problem = st.sidebar.selectbox("Choose a Problem:", problems[task][model])
52
 
53
+ # Dataset Selection (Simulated dataset paths)
54
+ dataset_mapping = {
55
+ "Spam Detection": "datasets/spam_detection.csv",
56
+ "Disease Prediction": "datasets/disease_prediction.csv",
57
+ "Image Recognition": "datasets/image_recognition.csv",
58
+ "Text Classification": "datasets/text_classification.csv",
59
+ "Fraud Detection": "datasets/fraud_detection.csv",
60
+ "Customer Segmentation": "datasets/customer_segmentation.csv",
61
+ "Loan Approval": "datasets/loan_approval.csv",
62
+ "House Price Prediction": "datasets/house_price_prediction.csv",
63
+ "Sales Forecasting": "datasets/sales_forecasting.csv",
64
+ }
65
 
66
  dataset_path = dataset_mapping.get(problem, "datasets/spam_detection.csv")
67
  df = pd.read_csv(dataset_path)
 
91
  df[col] = label_encoders[col].fit_transform(df[col])
92
 
93
  # Split Data
94
+ X = df.iloc[:, :-1] # Features
95
+ y = df.iloc[:, -1] # Target
96
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
97
 
98
  # Feature Scaling
 
101
  X_test = scaler.transform(X_test)
102
 
103
  # Feature Selection
104
+ selector = SelectKBest(score_func=f_classif, k=min(5, X.shape[1])) # Ensure k does not exceed available features
105
  X_train = selector.fit_transform(X_train, y_train)
106
  X_test = selector.transform(X_test)
107
 
108
+ # Handle imbalanced data (only for classification)
109
  if task == "Classification":
110
+ if len(set(y_train)) > 1 and len(y_train) > 5: # Avoid SMOTE errors
 
 
 
111
  smote = SMOTE()
112
  X_train, y_train = smote.fit_resample(X_train, y_train)
 
 
 
113
 
114
  # Model Initialization
 
115
  model_mapping = {
116
+ "KNN": KNeighborsClassifier() if task == "Classification" else KNeighborsRegressor(),
117
  "SVM": SVC() if task == "Classification" else SVR(),
118
  "Random Forest": RandomForestClassifier() if task == "Classification" else RandomForestRegressor(),
119
  "Decision Tree": DecisionTreeClassifier() if task == "Classification" else DecisionTreeRegressor(),
120
  "Perceptron": Perceptron()
121
  }
122
+
123
  model_instance = model_mapping[model]
124
 
125
  # Train Model
 
128
 
129
  # Model Evaluation
130
  st.subheader("πŸ“Š Model Evaluation")
131
+
132
  if task == "Classification":
133
  accuracy = accuracy_score(y_test, y_pred)
134
  report = classification_report(y_test, y_pred, output_dict=True)
135
  st.write(f"**Accuracy:** {accuracy:.2f}")
136
  st.json(report)
137
+
138
+ elif task == "Regression":
139
  mse = mean_squared_error(y_test, y_pred)
140
  mae = mean_absolute_error(y_test, y_pred)
141
  r2 = r2_score(y_test, y_pred)
142
+ st.write(f"**Mean Squared Error (MSE):** {mse:.4f}")
143
+ st.write(f"**Mean Absolute Error (MAE):** {mae:.4f}")
144
+ st.write(f"**RΒ² Score:** {r2:.4f}")
145
 
146
  # Data Visualization
147
  st.subheader("πŸ“ˆ Data Visualization")
148
 
149
+ # Heatmap
150
  st.write("### πŸ”₯ Feature Correlation")
151
  fig, ax = plt.subplots(figsize=(8, 5))
152
  sns.heatmap(df.corr(), annot=True, cmap="coolwarm", ax=ax)
153
  st.pyplot(fig)
154
 
155
+ # Feature Importance (for tree-based models)
156
+ if model in ["Random Forest", "Decision Tree"]:
157
+ feature_importances = model_instance.feature_importances_
158
+ feature_names = X.columns
159
+ importance_df = pd.DataFrame({"Feature": feature_names, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
160
+
161
  st.write("### 🌟 Feature Importance")
162
  fig, ax = plt.subplots()
163
  sns.barplot(x=importance_df["Importance"], y=importance_df["Feature"], ax=ax)
164
  st.pyplot(fig)
165
 
166
+ # Show and Download Generated Code
167
+ generated_code = f"""
168
+ # AI Model Code
169
+ import pandas as pd
170
+ from sklearn.model_selection import train_test_split
171
+ from sklearn.preprocessing import StandardScaler
172
+ from {model_instance.__module__} import {model_instance.__class__.__name__}
173
+
174
+ # Load Data
175
+ df = pd.read_csv('{dataset_path}')
176
+ X = df.iloc[:, :-1]
177
+ y = df.iloc[:, -1]
178
+
179
+ # Train/Test Split
180
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
181
+
182
+ # Scaling
183
+ scaler = StandardScaler()
184
+ X_train = scaler.fit_transform(X_train)
185
+ X_test = scaler.transform(X_test)
186
+
187
+ # Train Model
188
+ model = {model_instance.__class__.__name__}()
189
+ model.fit(X_train, y_train)
190
+
191
+ # Predict
192
+ y_pred = model.predict(X_test)
193
+ print(y_pred)
194
+ """
195
+
196
+ st.subheader("πŸ“œ Generated Code")
197
+ st.code(generated_code, language="python")
198
+
199
+ # Download buttons
200
+ st.download_button("πŸ“₯ Download Python Script (.py)", generated_code, file_name="ai_model.py", mime="text/x-python")
201
+ st.download_button("πŸ“₯ Download Jupyter Notebook (.ipynb)", json.dumps({"cells": [{"cell_type": "code", "source": generated_code.split("\n"), "metadata": {}}], "metadata": {}, "nbformat": 4, "nbformat_minor": 2}), file_name="ai_model.ipynb", mime="application/json")
202
+
203
+ st.success("Code generated! Download and start using it! πŸš€")