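"""Streamlit app for training and interpreting several classifiers.

Trains six scikit-learn-compatible models on a pre-exported train/test
split, then offers four pages: performance comparison, model
interpretation (tree plots and SHAP values), feature analysis, and an
interactive prediction simulator.
"""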
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
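
# The two CSV files are assumed to sit next to this script and to contain a
# binary "Target" column; every other column is treated as a feature.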
def load_data():
    """Load the train/test splits and separate features from the target."""
    data = pd.read_csv('exported_named_train_good.csv')
    data_test = pd.read_csv('exported_named_test_good.csv')
    X_train = data.drop('Target', axis=1)
    y_train = data['Target']
    X_test = data_test.drop('Target', axis=1)
    y_test = data_test['Target']
    return X_train, y_train, X_test, y_test, X_train.columns
def train_models(X_train, y_train, X_test, y_test):
    """Fit each candidate model and collect train/test metrics."""
    models = {
        "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Gradient Boost": GradientBoostingClassifier(random_state=42),
        "Extreme Gradient Boosting": XGBClassifier(random_state=42),
        "Light Gradient Boosting": LGBMClassifier(random_state=42)
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)

        # Predictions: hard labels for accuracy/precision/recall/F1,
        # positive-class probabilities for ROC AUC
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        y_train_proba = model.predict_proba(X_train)[:, 1]
        y_test_proba = model.predict_proba(X_test)[:, 1]

        # Metrics (ROC AUC is computed from probabilities rather than
        # hard labels, so it reflects the full ranking of predictions)
        results[name] = {
            'model': model,
            'train_metrics': {
                'accuracy': accuracy_score(y_train, y_train_pred),
                'f1': f1_score(y_train, y_train_pred, average='weighted'),
                'precision': precision_score(y_train, y_train_pred),
                'recall': recall_score(y_train, y_train_pred),
                'roc_auc': roc_auc_score(y_train, y_train_proba)
            },
            'test_metrics': {
                'accuracy': accuracy_score(y_test, y_test_pred),
                'f1': f1_score(y_test, y_test_pred, average='weighted'),
                'precision': precision_score(y_test, y_test_pred),
                'recall': recall_score(y_test, y_test_pred),
                'roc_auc': roc_auc_score(y_test, y_test_proba)
            }
        }
    return results
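
# Side-by-side bar charts of train vs. test metrics make overfitting easy to
# spot: a large gap between the two panels for a given model suggests it has
# memorised the training set.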
def plot_model_performance(results):
    metrics = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Training metrics
    train_data = {model: [results[model]['train_metrics'][metric] for metric in metrics]
                  for model in results.keys()}
    train_df = pd.DataFrame(train_data, index=metrics)
    train_df.plot(kind='bar', ax=axes[0], title='Training Performance')
    axes[0].set_ylim(0, 1)

    # Test metrics
    test_data = {model: [results[model]['test_metrics'][metric] for metric in metrics]
                 for model in results.keys()}
    test_df = pd.DataFrame(test_data, index=metrics)
    test_df.plot(kind='bar', ax=axes[1], title='Test Performance')
    axes[1].set_ylim(0, 1)

    plt.tight_layout()
    return fig
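
# All tree-based models here (Decision Tree, Random Forest, Gradient Boost,
# XGBoost, LightGBM) expose feature_importances_; for logistic regression the
# absolute coefficient magnitudes are used instead. Note that raw coefficients
# are only comparable across features when the inputs are on similar scales.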
def plot_feature_importance(model, feature_names, model_type):
    plt.figure(figsize=(10, 6))
    if model_type == "Logistic Regression":
        importance = np.abs(model.coef_[0])
    else:
        # Covers all tree-based models, including XGBoost and LightGBM,
        # which the original branch left unhandled (NameError)
        importance = model.feature_importances_
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importance
    }).sort_values('importance', ascending=True)
    plt.barh(importance_df['feature'], importance_df['importance'])
    plt.title(f"Feature Importance - {model_type}")
    return plt.gcf()
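
# Main Streamlit entry point: a sidebar selects the model and the page; the
# four pages below share the cached models stored in session state.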
def app():
    st.title("Interpréteur de Modèles ML")

    # Load data
    X_train, y_train, X_test, y_test, feature_names = load_data()

    # Train models once and cache the results in session state
    if 'model_results' not in st.session_state:
        with st.spinner("Entraînement des modèles en cours..."):
            st.session_state.model_results = train_models(X_train, y_train, X_test, y_test)

    # Sidebar
    st.sidebar.title("Navigation")
    selected_model = st.sidebar.selectbox(
        "Sélectionnez un modèle",
        list(st.session_state.model_results.keys())
    )
    page = st.sidebar.radio(
        "Sélectionnez une section",
        ["Performance des modèles",
         "Interprétation du modèle",
         "Analyse des caractéristiques",
         "Simulateur de prédictions"]
    )

    current_model = st.session_state.model_results[selected_model]['model']
    # Model performance page
    if page == "Performance des modèles":
        st.header("Performance des modèles")

        # Plot global performance comparison
        st.subheader("Comparaison des performances")
        performance_fig = plot_model_performance(st.session_state.model_results)
        st.pyplot(performance_fig)

        # Detailed metrics for the selected model
        st.subheader(f"Métriques détaillées - {selected_model}")
        col1, col2 = st.columns(2)
        with col1:
            st.write("Métriques d'entraînement:")
            for metric, value in st.session_state.model_results[selected_model]['train_metrics'].items():
                st.write(f"{metric}: {value:.4f}")
        with col2:
            st.write("Métriques de test:")
            for metric, value in st.session_state.model_results[selected_model]['test_metrics'].items():
                st.write(f"{metric}: {value:.4f}")
    # Model interpretation page
    elif page == "Interprétation du modèle":
        st.header(f"Interprétation du modèle - {selected_model}")

        # Tree visualization and decision rules only apply to a single tree
        if selected_model == "Decision Tree":
            st.subheader("Visualisation de l'arbre")
            max_depth = st.slider("Profondeur maximale à afficher", 1, 5, 3)
            fig, ax = plt.subplots(figsize=(20, 10))
            plot_tree(current_model, feature_names=list(feature_names),
                      max_depth=max_depth, filled=True, rounded=True, ax=ax)
            st.pyplot(fig)

            st.subheader("Règles de décision importantes")
            st.text(export_text(current_model, feature_names=list(feature_names)))

        # SHAP values for all models: tree-based models use TreeExplainer,
        # logistic regression uses LinearExplainer
        st.subheader("SHAP Values")
        with st.spinner("Calcul des valeurs SHAP en cours..."):
            if selected_model == "Logistic Regression":
                explainer = shap.LinearExplainer(current_model, X_train)
            else:
                explainer = shap.TreeExplainer(current_model)
            shap_values = explainer.shap_values(X_train.iloc[:100])  # first 100 samples for speed

            fig, ax = plt.subplots(figsize=(10, 6))
            shap.summary_plot(shap_values, X_train.iloc[:100],
                              feature_names=list(feature_names), show=False)
            st.pyplot(fig)
    # Feature analysis page
    elif page == "Analyse des caractéristiques":
        st.header("Analyse des caractéristiques")

        # Feature importance
        st.subheader("Importance des caractéristiques")
        importance_fig = plot_feature_importance(current_model, feature_names, selected_model)
        st.pyplot(importance_fig)

        # Feature correlation
        st.subheader("Matrice de corrélation")
        corr_matrix = X_train.corr()
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
        st.pyplot(fig)
    # Prediction simulator page
    else:
        st.header("Simulateur de prédictions")

        # Build one input widget per feature: a selectbox for categorical
        # columns, a slider spanning the observed range for numeric ones
        input_values = {}
        for feature in feature_names:
            if X_train[feature].dtype == 'object':
                input_values[feature] = st.selectbox(
                    f"Sélectionnez {feature}",
                    options=X_train[feature].unique()
                )
            else:
                input_values[feature] = st.slider(
                    f"Valeur pour {feature}",
                    float(X_train[feature].min()),
                    float(X_train[feature].max()),
                    float(X_train[feature].mean())
                )

        if st.button("Prédire"):
            input_df = pd.DataFrame([input_values])
            prediction = current_model.predict_proba(input_df)

            st.write("Probabilités prédites:")
            st.write({f"Classe {i}": f"{prob:.2%}" for i, prob in enumerate(prediction[0])})

            if selected_model == "Decision Tree":
                st.subheader("Chemin de décision")
                # Recover the sequence of internal nodes this sample traverses
                node_indicator = current_model.decision_path(input_df)
                leaf_id = current_model.apply(input_df)
                node_index = node_indicator.indices[node_indicator.indptr[0]:node_indicator.indptr[1]]

                rules = []
                for node_id in node_index:
                    if node_id != leaf_id[0]:
                        threshold = current_model.tree_.threshold[node_id]
                        feature = feature_names[current_model.tree_.feature[node_id]]
                        if input_df.iloc[0][feature] <= threshold:
                            rules.append(f"{feature} ≤ {threshold:.2f}")
                        else:
                            rules.append(f"{feature} > {threshold:.2f}")
                for rule in rules:
                    st.write(rule)
if __name__ == "__main__":
    app()