Surbhi committed on
Commit
1960a99
·
1 Parent(s): b2fd176

Feature extraction and model training

Browse files
app.py CHANGED
@@ -1,112 +1,93 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
- import joblib
5
- import textwrap
6
-
7
  from sklearn.model_selection import train_test_split
8
  from sklearn.preprocessing import StandardScaler, LabelEncoder
 
9
  from sklearn.impute import SimpleImputer
10
- from sklearn.feature_selection import SelectKBest, f_classif, f_regression
11
- from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error, r2_score
12
  from imblearn.over_sampling import SMOTE
 
13
 
14
- # Streamlit UI
15
- st.title("🚀 AI Code Generator")
16
- st.markdown("Generate & Train ML Models with Preprocessing and Feature Selection")
 
 
 
17
 
18
  # Sidebar UI
19
- st.sidebar.title("Choose Options")
 
 
 
20
  model_options = ["KNN", "SVM", "Random Forest", "Decision Tree", "Perceptron"]
21
  model = st.sidebar.selectbox("Choose a Model:", model_options)
22
 
 
23
  task_options = ["Classification", "Regression"]
24
  task = st.sidebar.selectbox("Choose a Task:", task_options)
25
 
26
- # Load Dataset
27
- st.markdown("### Upload your Dataset (CSV)")
28
- uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
29
-
30
- if uploaded_file:
31
- data = pd.read_csv(uploaded_file)
32
- st.write("Preview of Dataset:", data.head())
33
-
34
- # Preprocessing Steps
35
- st.markdown("### Data Preprocessing Steps")
36
-
37
- # Handling Missing Values
38
- st.write("✅ Handling missing values using `SimpleImputer`")
39
- imputer = SimpleImputer(strategy="mean")
40
- data.fillna(data.mean(), inplace=True)
41
-
42
- # Encoding Categorical Variables
43
- st.write("✅ Encoding categorical variables")
44
- for col in data.select_dtypes(include=["object"]).columns:
45
- data[col] = LabelEncoder().fit_transform(data[col])
46
-
47
- # Splitting Data
48
- X = data.iloc[:, :-1] # Features
49
- y = data.iloc[:, -1] # Target
50
-
51
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
52
-
53
- # Feature Scaling
54
- st.write("✅ Applying StandardScaler")
55
- scaler = StandardScaler()
56
- X_train = scaler.fit_transform(X_train)
57
- X_test = scaler.transform(X_test)
58
-
59
- # Handle Imbalanced Dataset using SMOTE
60
- if task == "Classification":
61
- st.write("✅ Handling Imbalanced Dataset using SMOTE")
62
- smote = SMOTE()
63
- X_train, y_train = smote.fit_resample(X_train, y_train)
64
-
65
- # Feature Selection
66
- st.write("✅ Selecting Best Features")
67
- selector = SelectKBest(f_classif if task == "Classification" else f_regression, k=min(5, X.shape[1]))
68
- X_train = selector.fit_transform(X_train, y_train)
69
- X_test = selector.transform(X_test)
70
-
71
- # Model Training
72
- model_mapping = {
73
- "KNN": "KNeighborsClassifier" if task == "Classification" else "KNeighborsRegressor",
74
- "SVM": "SVC" if task == "Classification" else "SVR",
75
- "Random Forest": "RandomForestClassifier" if task == "Classification" else "RandomForestRegressor",
76
- "Decision Tree": "DecisionTreeClassifier" if task == "Classification" else "DecisionTreeRegressor",
77
- "Perceptron": "Perceptron" if task == "Classification" else "Perceptron"
78
  }
79
-
80
- model_class = model_mapping[model]
81
-
82
- template = f"""
83
- import numpy as np
84
- import pandas as pd
85
- import joblib
86
-
87
- from sklearn.model_selection import train_test_split
88
- from sklearn.preprocessing import StandardScaler, LabelEncoder
89
- from sklearn.impute import SimpleImputer
90
- from sklearn.feature_selection import SelectKBest, f_classif, f_regression
91
- from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error, r2_score
92
- from imblearn.over_sampling import SMOTE
93
- from sklearn.{model.lower()} import {model_class}
94
-
95
- # Load Dataset
96
- data = pd.read_csv('dataset.csv')
97
-
98
- # Handling Missing Values
99
- imputer = SimpleImputer(strategy="mean")
100
- data.fillna(data.mean(), inplace=True)
101
-
102
- # Encoding Categorical Variables
103
- for col in data.select_dtypes(include=["object"]).columns:
104
- data[col] = LabelEncoder().fit_transform(data[col])
105
-
106
- # Splitting Data
107
- X = data.iloc[:, :-1]
108
- y = data.iloc[:, -1]
109
-
 
 
 
 
 
 
 
 
 
 
 
 
110
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
111
 
112
  # Feature Scaling
@@ -114,43 +95,50 @@ scaler = StandardScaler()
114
  X_train = scaler.fit_transform(X_train)
115
  X_test = scaler.transform(X_test)
116
 
117
- # Handle Imbalanced Data (SMOTE)
118
- if "{task}" == "Classification":
119
- smote = SMOTE()
120
- X_train, y_train = smote.fit_resample(X_train, y_train)
121
-
122
  # Feature Selection
123
- selector = SelectKBest(f_classif if "{task}" == "Classification" else f_regression, k=min(5, X.shape[1]))
124
  X_train = selector.fit_transform(X_train, y_train)
125
  X_test = selector.transform(X_test)
126
 
127
- # Model Training
128
- model = {model_class}()
129
- model.fit(X_train, y_train)
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
- # Save Trained Model
132
- joblib.dump(model, 'models/trained_model.pkl')
 
133
 
134
  # Evaluation Metrics
135
- if "{task}" == "Classification":
136
- y_pred = model.predict(X_test)
137
- print("Accuracy:", accuracy_score(y_test, y_pred))
138
- print("Precision:", precision_score(y_test, y_pred, average='weighted'))
139
- print("Recall:", recall_score(y_test, y_pred, average='weighted'))
140
- print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
141
  else:
142
- y_pred = model.predict(X_test)
143
- print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
144
- print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
145
- print("R2 Score:", r2_score(y_test, y_pred))
146
- """
147
 
148
- st.code(template, language="python")
149
- st.download_button("📥 Download AI Model Code", template, "ai_model.py")
 
 
 
150
 
151
- # Save Model
152
- model_instance = eval(model_class)()
153
- model_instance.fit(X_train, y_train)
154
- joblib.dump(model_instance, "models/trained_model.pkl")
155
 
156
- st.success("✅ Model trained and saved as `trained_model.pkl`")
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
 
6
  from sklearn.model_selection import train_test_split
7
  from sklearn.preprocessing import StandardScaler, LabelEncoder
8
+ from sklearn.feature_selection import SelectKBest, f_classif
9
  from sklearn.impute import SimpleImputer
 
 
10
  from imblearn.over_sampling import SMOTE
11
+ from sklearn.metrics import accuracy_score, classification_report
12
 
13
+ # Import ML Models
14
+ from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
15
+ from sklearn.svm import SVC, SVR
16
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
17
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
18
+ from sklearn.linear_model import Perceptron
19
 
20
  # Sidebar UI
21
+ st.sidebar.title("AI Code Generator 🧠")
22
+ st.sidebar.markdown("Generate AI models instantly!")
23
+
24
+ # Model Selection
25
  model_options = ["KNN", "SVM", "Random Forest", "Decision Tree", "Perceptron"]
26
  model = st.sidebar.selectbox("Choose a Model:", model_options)
27
 
28
+ # Task Selection
29
  task_options = ["Classification", "Regression"]
30
  task = st.sidebar.selectbox("Choose a Task:", task_options)
31
 
32
+ # Problem Selection based on Task and Model
33
+ problems = {
34
+ "Classification": {
35
+ "KNN": ["Spam Detection", "Disease Prediction"],
36
+ "SVM": ["Image Recognition", "Text Classification"],
37
+ "Random Forest": ["Fraud Detection", "Customer Segmentation"],
38
+ "Decision Tree": ["Loan Approval", "Churn Prediction"],
39
+ "Perceptron": ["Handwritten Digit Recognition", "Sentiment Analysis"]
40
+ },
41
+ "Regression": {
42
+ "KNN": ["House Price Prediction", "Stock Prediction"],
43
+ "SVM": ["Sales Forecasting", "Stock Market Trends"],
44
+ "Random Forest": ["Energy Consumption", "Patient Survival Prediction"],
45
+ "Decision Tree": ["House Price Estimation", "Revenue Prediction"],
46
+ "Perceptron": ["Weather Forecasting", "Traffic Flow Prediction"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  }
48
+ }
49
+
50
+ problem = st.sidebar.selectbox("Choose a Problem:", problems[task][model])
51
+
52
+ # Dataset Selection (User selects a pre-existing fake dataset)
53
+ dataset_mapping = {
54
+ "Spam Detection": "datasets/spam_detection.csv",
55
+ "Disease Prediction": "datasets/disease_prediction.csv",
56
+ "Fraud Detection": "datasets/fraud_detection.csv",
57
+ "House Price Prediction": "datasets/house_price.csv",
58
+ "Sales Forecasting": "datasets/sales_forecasting.csv",
59
+ }
60
+
61
+ dataset_path = dataset_mapping.get(problem, "datasets/spam_detection.csv")
62
+ df = pd.read_csv(dataset_path)
63
+
64
+ # Display dataset
65
+ st.subheader("Sample Dataset")
66
+ st.write(df.head())
67
+
68
+ # Preprocessing Steps
69
+ st.subheader("📌 Preprocessing Steps")
70
+ st.markdown("""
71
+ - ✅ Handle Missing Values
72
+ - ✅ Encoding Categorical Variables
73
+ - ✅ Feature Scaling
74
+ - ✅ Feature Selection
75
+ - ✅ Handling Imbalanced Data using **SMOTE**
76
+ """)
77
+
78
+ # Handle missing values
79
+ imputer = SimpleImputer(strategy='mean')
80
+ df = df.apply(lambda col: imputer.fit_transform(col.values.reshape(-1, 1)).flatten() if col.dtypes == 'float64' else col)
81
+
82
+ # Encoding categorical variables
83
+ label_encoders = {}
84
+ for col in df.select_dtypes(include=['object']).columns:
85
+ label_encoders[col] = LabelEncoder()
86
+ df[col] = label_encoders[col].fit_transform(df[col])
87
+
88
+ # Split Data
89
+ X = df.iloc[:, :-1] # Features
90
+ y = df.iloc[:, -1] # Target
91
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
92
 
93
  # Feature Scaling
 
95
  X_train = scaler.fit_transform(X_train)
96
  X_test = scaler.transform(X_test)
97
 
 
 
 
 
 
98
  # Feature Selection
99
+ selector = SelectKBest(score_func=f_classif, k=5)
100
  X_train = selector.fit_transform(X_train, y_train)
101
  X_test = selector.transform(X_test)
102
 
103
+ # Handle imbalanced data
104
+ if task == "Classification":
105
+ smote = SMOTE()
106
+ X_train, y_train = smote.fit_resample(X_train, y_train)
107
+
108
+ # Model Initialization
109
+ model_mapping = {
110
+ "KNN": KNeighborsClassifier() if task == "Classification" else KNeighborsRegressor(),
111
+ "SVM": SVC() if task == "Classification" else SVR(),
112
+ "Random Forest": RandomForestClassifier() if task == "Classification" else RandomForestRegressor(),
113
+ "Decision Tree": DecisionTreeClassifier() if task == "Classification" else DecisionTreeRegressor(),
114
+ "Perceptron": Perceptron() if task == "Classification" else Perceptron()
115
+ }
116
+
117
+ model_instance = model_mapping[model]
118
 
119
+ # Train Model
120
+ model_instance.fit(X_train, y_train)
121
+ y_pred = model_instance.predict(X_test)
122
 
123
  # Evaluation Metrics
124
+ st.subheader("📊 Model Evaluation")
125
+ if task == "Classification":
126
+ accuracy = accuracy_score(y_test, y_pred)
127
+ report = classification_report(y_test, y_pred)
128
+ st.write(f"**Accuracy:** {accuracy:.2f}")
129
+ st.text(report)
130
  else:
131
+ st.write("Regression evaluation metrics will be added soon!")
 
 
 
 
132
 
133
+ # Visualization
134
+ st.subheader("📈 Data Visualization")
135
+ plt.figure(figsize=(8, 5))
136
+ sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
137
+ st.pyplot(plt)
138
 
139
+ # Download Code
140
+ st.download_button("🐍 Download Python Code (.py)", "ai_model.py")
141
+ st.download_button("📓 Download Notebook (.ipynb)", "ai_model.ipynb")
142
+ st.markdown("[🚀 Open in Colab](https://colab.research.google.com/)")
143
 
144
+ st.success("Code generated! Download and do magic! ✨")
dataset.csv DELETED
@@ -1,14 +0,0 @@
1
- # Fake dataset for AI Code Generator
2
- # You can replace this with your own dataset
3
-
4
- feature1,feature2,feature3,feature4,target
5
- 34,180,1,50000,0
6
- 25,165,0,60000,1
7
- 40,175,1,55000,0
8
- 30,170,0,62000,1
9
- 45,185,1,58000,0
10
- 28,160,0,57000,1
11
- 35,178,1,53000,0
12
- 50,190,1,49000,1
13
- 23,158,0,61000,0
14
- 38,172,1,56000,1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
datasets/disease_prediction.csv ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fever,cough,fatigue,disease
2
+ 98.6,0,0,"Healthy"
3
+ 100.2,1,1,"Flu"
4
+ 101.5,1,0,"COVID-19"
datasets/fraud_detection.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ transaction_amount,transaction_type,location,is_fraud
2
+ 500,Credit Card,New York,0
3
+ 1200,Wire Transfer,California,1
4
+ 250,Debit Card,Texas,0
5
+ 800,Online Purchase,Florida,1
6
+ 50,Cash Withdrawal,Illinois,0
datasets/house_price.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ area_sqft,bedrooms,bathrooms,location,price
2
+ 1200,3,2,New York,350000
3
+ 1800,4,3,California,500000
4
+ 950,2,1,Texas,200000
5
+ 2200,5,4,Florida,600000
6
+ 1100,3,2,Illinois,300000
datasets/sales_forecasting.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ month,product,units_sold,revenue
2
+ January,Product A,150,4500
3
+ February,Product A,200,6000
4
+ March,Product B,180,5400
5
+ April,Product C,250,7500
6
+ May,Product B,220,6600
datasets/spam_detection.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ email_text,is_spam
2
+ "Congratulations! You won a lottery",1
3
+ "Important update on your bank account",1
4
+ "Meeting tomorrow at 10 AM",0
5
+ "Get your free trial now!",1
6
+ "Project submission deadline extended",0
models/trained_model.pkl DELETED
File without changes
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  streamlit
2
  pandas
3
  numpy
 
 
4
  scikit-learn
5
- joblib
6
  imbalanced-learn
 
1
  streamlit
2
  pandas
3
  numpy
4
+ matplotlib
5
+ seaborn
6
  scikit-learn
 
7
  imbalanced-learn