# Travel.Com / app.py
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, roc_curve)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
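
# The imports above imply the following third-party dependencies. A minimal install
# and launch sequence (an assumption based on the imports, not on a requirements
# file shipped with this Space) would look like:
#     pip install streamlit pandas numpy matplotlib seaborn scikit-learn shap xgboost lightgbm
#     streamlit run app.py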
def load_data():
    """Load the pre-split train/test CSV files and separate features from the target."""
    data = pd.read_csv('exported_named_train_good.csv')
    data_test = pd.read_csv('exported_named_test_good.csv')

    X_train = data.drop("Target", axis=1)
    y_train = data['Target']
    X_test = data_test.drop('Target', axis=1)
    y_test = data_test['Target']

    return X_train, y_train, X_test, y_test, X_train.columns
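
# Usage sketch for load_data(), assuming the two CSV files named above exist in the
# working directory and contain a binary "Target" column (not part of the app flow):
#     X_train, y_train, X_test, y_test, feature_names = load_data()
#     print(X_train.shape, y_train.value_counts())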
def train_models(X_train, y_train, X_test, y_test):
    """Fit each candidate classifier and collect its train and test metrics."""
    models = {
        "Logistic Regression": LogisticRegression(random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Gradient Boost": GradientBoostingClassifier(random_state=42),
        "Extreme Gradient Boosting": XGBClassifier(random_state=42),
        "Light Gradient Boosting": LGBMClassifier(random_state=42)
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)

        # Predictions: hard labels for accuracy/precision/recall/F1,
        # class-1 probabilities for ROC AUC
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        y_train_proba = model.predict_proba(X_train)[:, 1]
        y_test_proba = model.predict_proba(X_test)[:, 1]

        # Metrics
        results[name] = {
            'model': model,
            'train_metrics': {
                'accuracy': accuracy_score(y_train, y_train_pred),
                'f1': f1_score(y_train, y_train_pred, average='weighted'),
                'precision': precision_score(y_train, y_train_pred),
                'recall': recall_score(y_train, y_train_pred),
                'roc_auc': roc_auc_score(y_train, y_train_proba)
            },
            'test_metrics': {
                'accuracy': accuracy_score(y_test, y_test_pred),
                'f1': f1_score(y_test, y_test_pred, average='weighted'),
                'precision': precision_score(y_test, y_test_pred),
                'recall': recall_score(y_test, y_test_pred),
                'roc_auc': roc_auc_score(y_test, y_test_proba)
            }
        }

    return results
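
# Minimal usage sketch for train_models(); the metric keys mirror the dicts built above
# (accuracy, f1, precision, recall, roc_auc):
#     results = train_models(X_train, y_train, X_test, y_test)
#     print(results["Random Forest"]["test_metrics"]["roc_auc"])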
def plot_model_performance(results):
    """Plot side-by-side bar charts of train and test metrics for every model."""
    metrics = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Training metrics
    train_data = {model: [results[model]['train_metrics'][metric] for metric in metrics]
                  for model in results.keys()}
    train_df = pd.DataFrame(train_data, index=metrics)
    train_df.plot(kind='bar', ax=axes[0], title='Training Performance')
    axes[0].set_ylim(0, 1)

    # Test metrics
    test_data = {model: [results[model]['test_metrics'][metric] for metric in metrics]
                 for model in results.keys()}
    test_df = pd.DataFrame(test_data, index=metrics)
    test_df.plot(kind='bar', ax=axes[1], title='Test Performance')
    axes[1].set_ylim(0, 1)

    plt.tight_layout()
    return fig
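
# The returned figure can be rendered by Streamlit or saved with plain matplotlib, e.g.:
#     fig = plot_model_performance(results)
#     st.pyplot(fig)                          # inside the app
#     fig.savefig("model_performance.png")    # or exported for a report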
def plot_feature_importance(model, feature_names, model_type):
    """Plot feature importances (absolute coefficients for Logistic Regression)."""
    plt.figure(figsize=(10, 6))

    if model_type == "Logistic Regression":
        importance = np.abs(model.coef_[0])
    else:
        # Tree-based models (Decision Tree, Random Forest, Gradient Boost,
        # XGBoost, LightGBM) all expose feature_importances_
        importance = model.feature_importances_

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importance
    }).sort_values('importance', ascending=True)

    plt.barh(importance_df['feature'], importance_df['importance'])
    plt.title(f"Feature Importance - {model_type}")
    return plt.gcf()
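
# Usage sketch; model_type must be one of the keys used in train_models():
#     fig = plot_feature_importance(results["Decision Tree"]["model"],
#                                   feature_names, "Decision Tree")
#     st.pyplot(fig)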
def app():
    st.title("ML Model Interpreter")

    # Load data
    X_train, y_train, X_test, y_test, feature_names = load_data()

    # Train models if not in session state
    if 'model_results' not in st.session_state:
        with st.spinner("Training models..."):
            st.session_state.model_results = train_models(X_train, y_train, X_test, y_test)

    # Sidebar
    st.sidebar.title("Navigation")
    selected_model = st.sidebar.selectbox(
        "Select a model",
        list(st.session_state.model_results.keys())
    )

    page = st.sidebar.radio(
        "Select a section",
        ["Model Performance",
         "Model Interpretation",
         "Feature Analysis",
         "Prediction Simulator"]
    )

    current_model = st.session_state.model_results[selected_model]['model']
    # Model Performance
    if page == "Model Performance":
        st.header("Model Performance")

        # Plot global performance comparison
        st.subheader("Performance comparison")
        performance_fig = plot_model_performance(st.session_state.model_results)
        st.pyplot(performance_fig)

        # Detailed metrics for selected model
        st.subheader(f"Detailed metrics - {selected_model}")
        col1, col2 = st.columns(2)

        with col1:
            st.write("Training metrics:")
            for metric, value in st.session_state.model_results[selected_model]['train_metrics'].items():
                st.write(f"{metric}: {value:.4f}")

        with col2:
            st.write("Test metrics:")
            for metric, value in st.session_state.model_results[selected_model]['test_metrics'].items():
                st.write(f"{metric}: {value:.4f}")
    # Model Interpretation
    elif page == "Model Interpretation":
        st.header(f"Model Interpretation - {selected_model}")

        # Tree visualization and decision rules only apply to the Decision Tree
        if selected_model == "Decision Tree":
            st.subheader("Tree visualization")
            max_depth = st.slider("Maximum depth to display", 1, 5, 3)
            fig, ax = plt.subplots(figsize=(20, 10))
            plot_tree(current_model, feature_names=list(feature_names),
                      max_depth=max_depth, filled=True, rounded=True)
            st.pyplot(fig)

            st.subheader("Key decision rules")
            st.text(export_text(current_model, feature_names=list(feature_names)))

        # SHAP values for all models
        st.subheader("SHAP Values")
        with st.spinner("Computing SHAP values..."):
            explainer = shap.TreeExplainer(current_model) if selected_model != "Logistic Regression" \
                else shap.LinearExplainer(current_model, X_train)
            shap_values = explainer.shap_values(X_train[:100])  # first 100 samples for speed

            fig, ax = plt.subplots(figsize=(10, 6))
            shap.summary_plot(shap_values, X_train[:100], feature_names=list(feature_names),
                              show=False)
            st.pyplot(fig)
    # Feature Analysis
    elif page == "Feature Analysis":
        st.header("Feature Analysis")

        # Feature importance
        st.subheader("Feature importance")
        importance_fig = plot_feature_importance(current_model, feature_names, selected_model)
        st.pyplot(importance_fig)

        # Feature correlation
        st.subheader("Correlation matrix")
        corr_matrix = X_train.corr()
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
        st.pyplot(fig)
    # Prediction Simulator
    else:
        st.header("Prediction Simulator")

        # Build one input widget per feature
        input_values = {}
        for feature in feature_names:
            if X_train[feature].dtype == 'object':
                input_values[feature] = st.selectbox(
                    f"Select {feature}",
                    options=X_train[feature].unique()
                )
            else:
                input_values[feature] = st.slider(
                    f"Value for {feature}",
                    float(X_train[feature].min()),
                    float(X_train[feature].max()),
                    float(X_train[feature].mean())
                )

        if st.button("Predict"):
            input_df = pd.DataFrame([input_values])
            prediction = current_model.predict_proba(input_df)

            st.write("Predicted probabilities:")
            st.write({f"Class {i}": f"{prob:.2%}" for i, prob in enumerate(prediction[0])})

            if selected_model == "Decision Tree":
                st.subheader("Decision path")
                node_indicator = current_model.decision_path(input_df)
                leaf_id = current_model.apply(input_df)

                # Nodes visited by this sample on its way to the leaf
                node_index = node_indicator.indices[node_indicator.indptr[0]:node_indicator.indptr[1]]

                rules = []
                for node_id in node_index:
                    if node_id != leaf_id[0]:
                        threshold = current_model.tree_.threshold[node_id]
                        feature = feature_names[current_model.tree_.feature[node_id]]
                        if input_df.iloc[0][feature] <= threshold:
                            rules.append(f"{feature} <= {threshold:.2f}")
                        else:
                            rules.append(f"{feature} > {threshold:.2f}")

                for rule in rules:
                    st.write(rule)
if __name__ == "__main__":
    app()