import marimo

__generated_with = "0.11.4"
app = marimo.App(width="medium")


@app.cell(hide_code=True)
def _(mo):
    # Notebook title.
    mo.md(r"""# Diabetes Dataset Analysis""")
    return


@app.cell(hide_code=True)
def _():
    # Shared imports, exported so the other cells can depend on them.
    import marimo as mo
    import polars as pl
    return mo, pl


@app.cell(hide_code=True)
def _(mo):
    # Collapsible legend describing the 0/1 coding of the columns of interest.
    mo.accordion(
        {"Notes": """
    ## Dataset Column Notes
    > Only highlighted columns of interest

    * Diabetes_binary: [ 0 (No diabetes) | 1 (Pre/diabetes) ]
    * HighBP: [ 0 (No High BP) | 1 (High BP) ]
    * HighChol: [ 0 (No High Chol) | 1 (High Chol) ]
    * Stroke: [ 0 (Never) | 1 (Had) ]
    * HeartDiseaseorAttack: [ 0 (No) | 1 (Yes) ]
    * Smoker: [ 0 (<100 cigs lifetime) | 1 (>100 cigs lifetime) ]
    * HvyAlcohol: [ 0 (<14 🍺/week for men, <7 🍺/week for women) | 1 (otherwise) ]
    """}
    )
    return


@app.cell(hide_code=True)
def _(pl):
    # Load the full CSV, then keep only the target and the four
    # prior-condition columns used by the classifier below.
    dataset_raw = pl.read_csv("dataset/diabetes_binary_health_indicators_BRFSS2015.csv")
    dataset_prior_conditions = dataset_raw.select(
        ["Diabetes_binary", "HighBP", "HighChol", "Stroke", "HeartDiseaseorAttack"]
    )
    dataset_prior_conditions.head()
    return dataset_prior_conditions, dataset_raw


@app.cell
def _(mo):
    mo.md("""## Naive Bayes' Classifier""")
    return


@app.cell
def _(dataset_prior_conditions, mo, pl):
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    # Features are every column except the target; the target is kept as a
    # single-column DataFrame because a later cell concatenates it horizontally.
    X_priors_NB = dataset_prior_conditions.select(pl.exclude("Diabetes_binary"))
    y_priors_NB = dataset_prior_conditions.select("Diabetes_binary")

    # Fixed seed so the 75/25 split is reproducible between runs.
    X_train_priors, X_test_priors, y_train_priors, y_test_priors = train_test_split(
        X_priors_NB, y_priors_NB, random_state=33, test_size=0.25
    )

    bnb = BernoulliNB()
    # scikit-learn expects a 1-D target; passing the (n, 1) frame directly
    # raises a DataConversionWarning, so flatten it for the fit call only.
    bnb.fit(X_train_priors, y_train_priors.to_numpy().ravel())
    y_pred_priors = bnb.predict(X_test_priors)

    mo.md(f"""
    Accuracy : {accuracy_score(y_test_priors, y_pred_priors)}

    Confusion Matrix:
    ```
    {confusion_matrix(y_test_priors, y_pred_priors)}
    ```

    Classification Report:
    ```
    {classification_report(y_test_priors, y_pred_priors)}
    ```
    """)
    return (
        BernoulliNB,
        X_priors_NB,
        X_test_priors,
        X_train_priors,
        accuracy_score,
        bnb,
        classification_report,
        confusion_matrix,
        train_test_split,
        y_pred_priors,
        y_priors_NB,
        y_test_priors,
        y_train_priors,
    )


@app.cell
def _(X_test_priors, pl, y_pred_priors, y_test_priors):
    import altair as alt

    alt.data_transformers.enable("vegafusion")

    # Test features, true labels and predicted labels side by side.
    dataset_result_priors = pl.concat(
        [
            X_test_priors,
            y_test_priors,
            pl.DataFrame({"Predicted Diabetes_binary": y_pred_priors}),
        ],
        how="horizontal",
    )

    # View 1: weight the condition flags by 8/4/2 (HeartDiseaseorAttack keeps
    # weight 1) and sum them row-wise, so each combination of the four flags
    # maps to a single code. Assumes the flags are 0/1 as the notes describe.
    dataset_result_priors1 = dataset_result_priors.select(
        (pl.col("HighBP") * 8),
        (pl.col("HighChol") * 4),
        (pl.col("Stroke") * 2),
        pl.exclude(["HighBP", "HighChol", "Stroke"]),
    )
    dataset_result_priors1 = dataset_result_priors1.select(
        pl.sum_horizontal(pl.col("HighBP", "HighChol", "Stroke", "HeartDiseaseorAttack")),
        pl.col("Diabetes_binary", "Predicted Diabetes_binary"),
    )

    # View 2: keep the flags as they are and fold truth/prediction into one
    # code, 2 * actual + predicted (0 = both negative, 1 = predicted only,
    # 2 = actual only, 3 = both positive), again assuming 0/1 values.
    dataset_result_priors2 = dataset_result_priors.select(
        pl.exclude(["Diabetes_binary", "Predicted Diabetes_binary"]),
        (pl.col("Diabetes_binary") * 2),
        pl.col("Predicted Diabetes_binary"),
    )
    dataset_result_priors2 = dataset_result_priors2.select(
        pl.col("HighBP", "HighChol", "Stroke", "HeartDiseaseorAttack"),
        pl.sum_horizontal(pl.col("Diabetes_binary", "Predicted Diabetes_binary")),
    )
    dataset_result_priors2.head(10)
    return (
        alt,
        dataset_result_priors,
        dataset_result_priors1,
        dataset_result_priors2,
    )


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()