import marimo

__generated_with = "0.11.17"
app = marimo.App(width="medium")


@app.cell
def _(mo):
    # Notebook title.
    mo.md(r"""# Analyzing Colorectal Cancer Dataset""")
    return


@app.cell
def _():
    import marimo as mo
    import polars as pl
    return mo, pl


@app.cell
def _(pl):
    # Load the raw dataset; the path is relative to the working directory.
    dataset = pl.read_csv('./dataset/colorectal_cancer_dataset.csv')
    return (dataset,)


@app.cell(hide_code=True)
def _(dataset, pl):
    from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

    # Ordinal-encode the categorical columns consumed by the classifier cell.
    ord_encoder = OrdinalEncoder()
    ord_encoded = ord_encoder.fit_transform(
        dataset.select('Early_Detection', 'Cancer_Stage', 'Survival_5_years')
    )
    encoded_features = ord_encoder.get_feature_names_out(
        ['Early_Detection', 'Cancer_Stage', 'Survival_5_years']
    )
    # Every encoded column is stored as Int8.
    encoded_schema = {name: pl.Int8 for name in encoded_features}
    dataset_encoded_parts = pl.DataFrame(ord_encoded, encoded_schema)
    # with_columns overwrites columns that share a name with the new ones.
    dataset_encoded = dataset.with_columns(dataset_encoded_parts)
    return (
        OneHotEncoder,
        OrdinalEncoder,
        dataset_encoded,
        dataset_encoded_parts,
        encoded_features,
        encoded_schema,
        ord_encoded,
        ord_encoder,
    )


@app.cell
def _(dataset_encoded, mo):
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

    X = dataset_encoded.select(['Tumor_Size_mm', 'Early_Detection', 'Cancer_Stage'])
    # Keep the target 1-D (as the clustering cell does): a one-column frame
    # makes scikit-learn warn about a column-vector y on every fit.
    y = dataset_encoded.select(['Survival_5_years']).to_series()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

    logreg = LogisticRegression()
    y_pred_logreg = logreg.fit(X_train, y_train).predict(X_test)

    bnb = BernoulliNB()
    y_pred_bnb = bnb.fit(X_train, y_train).predict(X_test)

    dectree = DecisionTreeClassifier()
    y_pred_dectree = dectree.fit(X_train, y_train).predict(X_test)

    def _metrics_section(title, y_pred):
        # Markdown report for one classifier, scored against y_test.
        # Built flush-left line by line so the multi-line confusion matrix and
        # classification report cannot disturb the markdown indentation.
        return "\n".join([
            f"## {title}",
            "",
            f"Accuracy score: {accuracy_score(y_test, y_pred)}",
            "",
            f"Precision score: {precision_score(y_test, y_pred)}",
            "",
            "Confusion matrix:",
            "",
            "```",
            f"{confusion_matrix(y_test, y_pred)}",
            "```",
            "",
            "Classification report:",
            "",
            "```",
            f"{classification_report(y_test, y_pred)}",
            "```",
        ])

    # Underscore-prefixed names stay private to this cell.
    _callout = mo.callout("Classifiers don't work well with this dataset, let's try something else.", kind='info')
    _sections = [
        _metrics_section("Logistic Regression", y_pred_logreg),
        _metrics_section("Bernoulli Naive Bayes", y_pred_bnb),
        _metrics_section("Decision Tree Classifier", y_pred_dectree),
        f"{_callout}",
    ]
    mo.md("\n\n".join(_sections))
    return (
        BernoulliNB,
        DecisionTreeClassifier,
        LogisticRegression,
        X,
        X_test,
        X_train,
        accuracy_score,
        bnb,
        classification_report,
        confusion_matrix,
        dectree,
        logreg,
        precision_score,
        train_test_split,
        y,
        y_pred_bnb,
        y_pred_dectree,
        y_pred_logreg,
        y_test,
        y_train,
    )


@app.cell
def _(OrdinalEncoder, dataset, mo, pl):
    # The work is wrapped in a function so its many locals (X, y,
    # dataset_encoded, ...) stay out of the notebook's shared namespace.
    def _():
        from sklearn.cluster import KMeans, SpectralClustering
        from sklearn.metrics import (
            adjusted_rand_score,
            normalized_mutual_info_score,
            homogeneity_score,
            completeness_score,
            v_measure_score,
            silhouette_score,
            davies_bouldin_score,
            calinski_harabasz_score,
        )
        import altair as alt

        genmut_encoder = OrdinalEncoder()
        genmut_encoded = genmut_encoder.fit_transform(dataset.select('Genetic_Mutation'))
        genmut_features = genmut_encoder.get_feature_names_out(['Genetic_Mutation'])
        encoded_schema = {name: pl.Int8 for name in genmut_features}
        dataset_encoded_parts = pl.DataFrame(genmut_encoded, encoded_schema)
        dataset_encoded = dataset.with_columns(dataset_encoded_parts)

        # Use samples since dataset is way too big to run locally.
        # Clamp to the row count so a smaller dataset does not make sample() fail.
        sample_size = min(3000, dataset_encoded.height)
        dataset_encoded = dataset_encoded.sample(sample_size, seed=11)

        X = dataset_encoded.select(['Tumor_Size_mm', 'Genetic_Mutation'])
        # Cancer stage serves only as the reference labelling for the
        # external metrics; it is not a clustering input.
        y = dataset_encoded.select(['Cancer_Stage']).to_series()

        kmeans = KMeans(n_clusters=3, random_state=11)
        spec = SpectralClustering(n_clusters=3, random_state=11)

        labels_kmeans = kmeans.fit_predict(X)
        labels_spec = spec.fit_predict(X)

        # Cluster ids are stored as strings in the chart frames.
        df_kmeans = X.with_columns(pl.lit(labels_kmeans, dtype=pl.String).alias('kmeans_cluster'))
        df_spec = X.with_columns(pl.lit(labels_spec, dtype=pl.String).alias('spectral_cluster'))

        def cluster_section(title, labels):
            # Markdown report for one clustering: external metrics compare
            # the labels with y, internal metrics use X only.
            return "\n".join([
                f"## {title}",
                "",
                "### External Metrics (Based on Cancer Stage Labels)",
                "",
                f"Adjusted Rand Index (ARI): {adjusted_rand_score(y, labels)}",
                "",
                f"Normalized Mutual Information (NMI): {normalized_mutual_info_score(y, labels)}",
                "",
                f"Homogeneity: {homogeneity_score(y, labels)}",
                "",
                f"Completeness: {completeness_score(y, labels)}",
                "",
                f"V-measure: {v_measure_score(y, labels)}",
                "",
                "### Internal Metrics",
                "",
                f"Silhouette Score: {silhouette_score(X, labels)}",
                "",
                f"Davies-Bouldin Index: {davies_bouldin_score(X, labels)}",
                "",
                f"Calinski-Harabasz Index: {calinski_harabasz_score(X, labels)}",
            ])

        def cluster_chart(frame, cluster_column):
            # Tumor size against mutation flag, coloured by cluster.
            return alt.Chart(frame, autosize='pad').mark_rect().encode(
                alt.X('Genetic_Mutation:N'),
                y='Tumor_Size_mm',
                color=cluster_column
            ).properties(
                width=325
            ).interactive()

        callout = mo.callout("Unsupervised clustering techniques do perform reasonably well, but does not correlate to other labels.", 'info')
        report = "\n\n".join([
            cluster_section("K-Means Clustering", labels_kmeans),
            cluster_section("Spectral Clustering", labels_spec),
            f"{callout}",
        ])

        return mo.vstack([
            mo.md(report),
            mo.hstack([
                cluster_chart(df_kmeans, 'kmeans_cluster'),
                cluster_chart(df_spec, 'spectral_cluster'),
            ])
        ])

    # The last expression is what the cell displays.
    _()
    return


if __name__ == "__main__":
    app.run()