Surbhi
Feature extraction and model training
a15c8e6
raw
history blame
5.76 kB
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Perceptron, SGDRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
# ---------------------------------------------------------------------------
# Sidebar: the user picks a model family, a task type and a sample problem.
# ---------------------------------------------------------------------------
st.sidebar.title("AI Code Generator 🧠")
st.sidebar.markdown("Generate AI models instantly!")

# Catalogue of sample problems, keyed first by task and then by model name.
# Both tasks list the models in the same order; the option lists below are
# derived from this insertion order.
problems = dict(
    Classification={
        "KNN": ["Spam Detection", "Disease Prediction"],
        "SVM": ["Image Recognition", "Text Classification"],
        "Random Forest": ["Fraud Detection", "Customer Segmentation"],
        "Decision Tree": ["Loan Approval", "Churn Prediction"],
        "Perceptron": ["Handwritten Digit Recognition", "Sentiment Analysis"],
    },
    Regression={
        "KNN": ["House Price Prediction", "Stock Prediction"],
        "SVM": ["Sales Forecasting", "Stock Market Trends"],
        "Random Forest": ["Energy Consumption", "Patient Survival Prediction"],
        "Decision Tree": ["House Price Estimation", "Revenue Prediction"],
        "Perceptron": ["Weather Forecasting", "Traffic Flow Prediction"],
    },
)

# Model selection (widgets are created in the same order: model, task, problem).
model_options = list(problems["Classification"])
model = st.sidebar.selectbox("Choose a Model:", model_options)

# Task selection.
task_options = list(problems)
task = st.sidebar.selectbox("Choose a Task:", task_options)

# Problem selection: only the problems registered for the chosen task/model.
problem_choices = problems[task][model]
problem = st.sidebar.selectbox("Choose a Problem:", problem_choices)
# Dataset mapping (dynamic): every problem name maps to
# "datasets/<lower_snake_case_name>.csv". The loop variables deliberately do
# not reuse the name ``model``, which holds the sidebar selection.
dataset_mapping = {
    name: f"datasets/{name.lower().replace(' ', '_')}.csv"
    for models_for_task in problems.values()
    for problem_names in models_for_task.values()
    for name in problem_names
}
# Problems missing from the mapping fall back to the spam-detection sample.
dataset_path = dataset_mapping.get(problem, "datasets/spam_detection.csv")

# Loading can fail when the CSV is missing, empty or malformed. Report that in
# the UI and halt this script run instead of surfacing a raw traceback.
try:
    df = pd.read_csv(dataset_path)
except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
    st.error(f"Could not load dataset '{dataset_path}': {exc}")
    st.stop()

# The pipeline below uses the last column as the target and the rest as
# features, so at least one row and two columns are required.
if df.empty or df.shape[1] < 2:
    st.error(f"Dataset '{dataset_path}' must contain at least one row and two columns.")
    st.stop()

# Display dataset
st.subheader("Sample Dataset")
st.write(df.head())
# Preprocessing Steps: static overview shown to the user.
# The emoji were previously stored as mis-decoded UTF-8 (mojibake) and were
# rendered as garbage in the UI; they are restored to the intended characters.
st.subheader("📌 Preprocessing Steps")
st.markdown("""
- ✅ Handle Missing Values
- ✅ Encoding Categorical Variables
- ✅ Feature Scaling
- ✅ Feature Selection
- ✅ Handling Imbalanced Data using **SMOTE**
""")
# Handle missing values
# Columns that are entirely empty carry no information and would make the
# mean imputer return zero columns, so they are removed up front.
df = df.dropna(axis=1, how="all")

# Numeric columns: replace missing entries with the column mean. Only columns
# that actually contain missing values are touched, so integer columns keep
# their dtype.
imputer = SimpleImputer(strategy='mean')
numeric_missing = [
    col for col in df.select_dtypes(include="number").columns
    if df[col].isna().any()
]
if numeric_missing:
    df[numeric_missing] = imputer.fit_transform(df[numeric_missing])

# Encoding categorical variables
# Missing categories are filled with the most frequent value first; otherwise
# LabelEncoder would receive a mix of strings and float NaN and fail.
# Values are cast to str so columns with mixed Python types encode cleanly.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    if df[col].isna().any():
        df[col] = df[col].fillna(df[col].mode().iloc[0])
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col].astype(str))
# Split Data
# The last column is used as the target; all preceding columns are features.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
# The scaler is fitted on the training split only and then applied to the
# test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Feature Selection
# Keep at most 5 features. The scoring function must match the task:
# f_classif (ANOVA F-test) assumes a categorical target, while f_regression
# is the univariate F-test for a continuous target.
score_func = f_classif if task == "Classification" else f_regression
selector = SelectKBest(score_func=score_func, k=min(X.shape[1], 5))
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)
# Handle imbalanced data (classification only)
if task == "Classification":
    class_counts = Counter(y_train)
    min_class_samples = min(class_counts.values())
    # Resampling needs at least two classes; with a single class the
    # resamplers raise, so the training data is left as-is in that case.
    if len(class_counts) > 1:
        if min_class_samples > 5:
            # SMOTE interpolates between a sample and its 5 nearest same-class
            # neighbours, so the smallest class needs more than 5 samples.
            # Seeded to match the seeded train/test split.
            smote = SMOTE(random_state=42)
            X_train, y_train = smote.fit_resample(X_train, y_train)
        else:
            # Too few minority samples for SMOTE: duplicate existing ones.
            ros = RandomOverSampler(random_state=42)
            X_train, y_train = ros.fit_resample(X_train, y_train)
# Model Initialization
# KNN cannot use more neighbours than there are training samples.
n_neighbors = min(5, len(y_train))
is_classification = task == "Classification"
model_mapping = {
    "KNN": KNeighborsClassifier(n_neighbors=n_neighbors) if is_classification else KNeighborsRegressor(n_neighbors=n_neighbors),
    "SVM": SVC() if is_classification else SVR(),
    "Random Forest": RandomForestClassifier() if is_classification else RandomForestRegressor(),
    "Decision Tree": DecisionTreeClassifier() if is_classification else DecisionTreeRegressor(),
    # Perceptron is a classifier only and rejects continuous targets. For
    # regression the SGD-trained linear regressor is used instead, so the
    # "Perceptron" regression problems offered in the sidebar can be fitted.
    "Perceptron": Perceptron() if is_classification else SGDRegressor(),
}
model_instance = model_mapping[model]

# Train Model
model_instance.fit(X_train, y_train)
y_pred = model_instance.predict(X_test)
# Model Evaluation
# Classification reports accuracy plus the per-class report as JSON;
# regression reports MSE, MAE and R².
# The heading emoji and the superscript two were previously stored as
# mis-decoded UTF-8 (mojibake); they are restored to the intended characters.
st.subheader("📊 Model Evaluation")
if task == "Classification":
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    st.write(f"**Accuracy:** {accuracy:.2f}")
    st.json(report)
else:
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    st.write(f"**MSE:** {mse:.4f}, **MAE:** {mae:.4f}, **R² Score:** {r2:.4f}")
# Data Visualization
# Heading emoji were previously stored as mis-decoded UTF-8 (mojibake); they
# are restored to the intended characters.
st.subheader("📈 Data Visualization")

# Correlation heatmap over the preprocessed dataframe.
st.write("### 🔥 Feature Correlation")
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", ax=ax)
st.pyplot(fig)
# Streamlit re-runs the script on every interaction; close the figure so
# matplotlib does not accumulate open figures across reruns.
plt.close(fig)

if model in ["Random Forest", "Decision Tree"] and hasattr(model_instance, "feature_importances_"):
    # The model was trained on the SelectKBest subset, so importances must be
    # paired with the selected column names only. Using all of X.columns gives
    # a length mismatch (or wrong labels) once features have been dropped.
    selected_features = X.columns[selector.get_support()]
    importance_df = pd.DataFrame(
        {"Feature": selected_features, "Importance": model_instance.feature_importances_}
    ).sort_values(by="Importance", ascending=False)
    st.write("### 🌟 Feature Importance")
    fig, ax = plt.subplots()
    sns.barplot(x=importance_df["Importance"], y=importance_df["Feature"], ax=ax)
    st.pyplot(fig)
    plt.close(fig)

st.success("Code generated! 🚀 Download & run!")