Commit
·
b8863d9
1
Parent(s):
689e49e
Add unsupervised clustering models.
Browse files- .dockerignore +1 -0
- app.py +115 -24
.dockerignore
CHANGED
@@ -2,3 +2,4 @@
|
|
2 |
./dataset/diabetes_012_health_indicators_BRFSS2015.csv
|
3 |
./dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv
|
4 |
./dataset/final_phone_preferences_india.csv
|
|
|
|
2 |
./dataset/diabetes_012_health_indicators_BRFSS2015.csv
|
3 |
./dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv
|
4 |
./dataset/final_phone_preferences_india.csv
|
5 |
+
./__pycache__/
|
app.py
CHANGED
@@ -4,6 +4,12 @@ __generated_with = "0.11.17"
|
|
4 |
app = marimo.App(width="medium")
|
5 |
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
@app.cell
|
8 |
def _():
|
9 |
import marimo as mo
|
@@ -14,54 +20,45 @@ def _():
|
|
14 |
@app.cell
|
15 |
def _(pl):
|
16 |
dataset = pl.read_csv('./dataset/colorectal_cancer_dataset.csv')
|
17 |
-
dataset
|
18 |
return (dataset,)
|
19 |
|
20 |
|
21 |
-
@app.cell
|
22 |
def _(dataset, pl):
|
23 |
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
|
24 |
|
25 |
-
encoder = OneHotEncoder(sparse_output=False)
|
26 |
ord_encoder = OrdinalEncoder()
|
27 |
-
|
28 |
-
|
29 |
-
encoded_features = encoder.get_feature_names_out(['Obesity_BMI', 'Cancer_Stage'])
|
30 |
-
ord_encoded_features = ord_encoder.get_feature_names_out(['Survival_5_years'])
|
31 |
encoded_schema = {name: pl.Int8 for name in encoded_features}
|
32 |
-
|
33 |
-
dataset_encoded_parts = pl.DataFrame(
|
34 |
-
|
35 |
-
dataset_encoded
|
36 |
return (
|
37 |
OneHotEncoder,
|
38 |
OrdinalEncoder,
|
39 |
dataset_encoded,
|
40 |
dataset_encoded_parts,
|
41 |
-
dataset_ord_encoded_parts,
|
42 |
-
encoded,
|
43 |
encoded_features,
|
44 |
encoded_schema,
|
45 |
-
encoder,
|
46 |
ord_encoded,
|
47 |
-
ord_encoded_features,
|
48 |
-
ord_encoded_schema,
|
49 |
ord_encoder,
|
50 |
)
|
51 |
|
52 |
|
53 |
@app.cell
|
54 |
-
def _(dataset_encoded,
|
55 |
from sklearn.linear_model import LogisticRegression
|
56 |
from sklearn.naive_bayes import BernoulliNB
|
57 |
from sklearn.tree import DecisionTreeClassifier
|
58 |
-
from sklearn.svm import SVC
|
59 |
from sklearn.model_selection import train_test_split
|
60 |
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix
|
61 |
|
62 |
-
X = dataset_encoded.select(['
|
63 |
y = dataset_encoded.select(['Survival_5_years'])
|
64 |
-
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=
|
65 |
logreg = LogisticRegression()
|
66 |
y_pred_logreg = logreg.fit(X_train, y_train).predict(X_test)
|
67 |
bnb = BernoulliNB()
|
@@ -69,8 +66,9 @@ def _(dataset_encoded, encoded_features, mo):
|
|
69 |
dectree = DecisionTreeClassifier()
|
70 |
y_pred_dectree = dectree.fit(X_train, y_train).predict(X_test)
|
71 |
|
|
|
72 |
mo.md(f"""
|
73 |
-
|
74 |
|
75 |
Accuracy score: {accuracy_score(y_test, y_pred_logreg)}
|
76 |
|
@@ -86,7 +84,7 @@ def _(dataset_encoded, encoded_features, mo):
|
|
86 |
{classification_report(y_test, y_pred_logreg)}
|
87 |
```
|
88 |
|
89 |
-
|
90 |
|
91 |
Accuracy score: {accuracy_score(y_test, y_pred_bnb)}
|
92 |
|
@@ -102,7 +100,7 @@ def _(dataset_encoded, encoded_features, mo):
|
|
102 |
{classification_report(y_test, y_pred_bnb)}
|
103 |
```
|
104 |
|
105 |
-
|
106 |
|
107 |
Accuracy score: {accuracy_score(y_test, y_pred_dectree)}
|
108 |
|
@@ -117,12 +115,15 @@ def _(dataset_encoded, encoded_features, mo):
|
|
117 |
```
|
118 |
{classification_report(y_test, y_pred_dectree)}
|
119 |
```
|
|
|
|
|
|
|
|
|
120 |
""")
|
121 |
return (
|
122 |
BernoulliNB,
|
123 |
DecisionTreeClassifier,
|
124 |
LogisticRegression,
|
125 |
-
SVC,
|
126 |
X,
|
127 |
X_test,
|
128 |
X_train,
|
@@ -143,6 +144,96 @@ def _(dataset_encoded, encoded_features, mo):
|
|
143 |
)
|
144 |
|
145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
|
147 |
if __name__ == "__main__":
|
148 |
app.run()
|
|
|
4 |
app = marimo.App(width="medium")
|
5 |
|
6 |
|
7 |
+
@app.cell
|
8 |
+
def _(mo):
|
9 |
+
mo.md(r"""# Analyzing Colorectal Cancer Dataset""")
|
10 |
+
return
|
11 |
+
|
12 |
+
|
13 |
@app.cell
|
14 |
def _():
|
15 |
import marimo as mo
|
|
|
20 |
@app.cell
|
21 |
def _(pl):
|
22 |
dataset = pl.read_csv('./dataset/colorectal_cancer_dataset.csv')
|
23 |
+
# dataset.select("Tumor_Size_mm").describe()
|
24 |
return (dataset,)
|
25 |
|
26 |
|
27 |
+
@app.cell(hide_code=True)
|
28 |
def _(dataset, pl):
|
29 |
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
|
30 |
|
|
|
31 |
ord_encoder = OrdinalEncoder()
|
32 |
+
ord_encoded = ord_encoder.fit_transform(dataset.select('Early_Detection', 'Cancer_Stage', 'Survival_5_years'))
|
33 |
+
encoded_features = ord_encoder.get_feature_names_out(['Early_Detection', 'Cancer_Stage', 'Survival_5_years'])
|
|
|
|
|
34 |
encoded_schema = {name: pl.Int8 for name in encoded_features}
|
35 |
+
# print(encoded_schema)
|
36 |
+
dataset_encoded_parts = pl.DataFrame(ord_encoded, encoded_schema)
|
37 |
+
dataset_encoded = dataset.with_columns(dataset_encoded_parts)
|
38 |
+
# dataset_encoded
|
39 |
return (
|
40 |
OneHotEncoder,
|
41 |
OrdinalEncoder,
|
42 |
dataset_encoded,
|
43 |
dataset_encoded_parts,
|
|
|
|
|
44 |
encoded_features,
|
45 |
encoded_schema,
|
|
|
46 |
ord_encoded,
|
|
|
|
|
47 |
ord_encoder,
|
48 |
)
|
49 |
|
50 |
|
51 |
@app.cell
|
52 |
+
def _(dataset_encoded, mo):
|
53 |
from sklearn.linear_model import LogisticRegression
|
54 |
from sklearn.naive_bayes import BernoulliNB
|
55 |
from sklearn.tree import DecisionTreeClassifier
|
|
|
56 |
from sklearn.model_selection import train_test_split
|
57 |
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix
|
58 |
|
59 |
+
X = dataset_encoded.select(['Tumor_Size_mm', 'Early_Detection', 'Cancer_Stage'])
|
60 |
y = dataset_encoded.select(['Survival_5_years'])
|
61 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
|
62 |
logreg = LogisticRegression()
|
63 |
y_pred_logreg = logreg.fit(X_train, y_train).predict(X_test)
|
64 |
bnb = BernoulliNB()
|
|
|
66 |
dectree = DecisionTreeClassifier()
|
67 |
y_pred_dectree = dectree.fit(X_train, y_train).predict(X_test)
|
68 |
|
69 |
+
|
70 |
mo.md(f"""
|
71 |
+
## Logistic Regression
|
72 |
|
73 |
Accuracy score: {accuracy_score(y_test, y_pred_logreg)}
|
74 |
|
|
|
84 |
{classification_report(y_test, y_pred_logreg)}
|
85 |
```
|
86 |
|
87 |
+
## Bernoulli Naive Bayes
|
88 |
|
89 |
Accuracy score: {accuracy_score(y_test, y_pred_bnb)}
|
90 |
|
|
|
100 |
{classification_report(y_test, y_pred_bnb)}
|
101 |
```
|
102 |
|
103 |
+
## Decision Tree Classifier
|
104 |
|
105 |
Accuracy score: {accuracy_score(y_test, y_pred_dectree)}
|
106 |
|
|
|
115 |
```
|
116 |
{classification_report(y_test, y_pred_dectree)}
|
117 |
```
|
118 |
+
|
119 |
+
## Conclusion
|
120 |
+
|
121 |
+
{mo.callout("Classifiers don't work well with this dataset, let's try something else.", kind='info')}
|
122 |
""")
|
123 |
return (
|
124 |
BernoulliNB,
|
125 |
DecisionTreeClassifier,
|
126 |
LogisticRegression,
|
|
|
127 |
X,
|
128 |
X_test,
|
129 |
X_train,
|
|
|
144 |
)
|
145 |
|
146 |
|
147 |
+
@app.cell
|
148 |
+
def _(OrdinalEncoder, dataset, mo, pl):
|
149 |
+
def _():
|
150 |
+
from sklearn.cluster import KMeans, SpectralClustering, DBSCAN
|
151 |
+
from sklearn.svm import SVC
|
152 |
+
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, homogeneity_score, completeness_score, v_measure_score, silhouette_score, davies_bouldin_score, calinski_harabasz_score
|
153 |
+
import altair as alt
|
154 |
+
|
155 |
+
genmut_encoder = OrdinalEncoder()
|
156 |
+
genmut_encoded = genmut_encoder.fit_transform(dataset.select('Genetic_Mutation'))
|
157 |
+
genmut_features = genmut_encoder.get_feature_names_out(['Genetic_Mutation'])
|
158 |
+
encoded_schema = {name: pl.Int8 for name in genmut_features}
|
159 |
+
dataset_encoded_parts = pl.DataFrame(genmut_encoded, encoded_schema)
|
160 |
+
dataset_encoded = dataset.with_columns(dataset_encoded_parts)
|
161 |
+
# Use samples since dataset is way too big to run locally
|
162 |
+
dataset_encoded = dataset_encoded.sample(3000, seed=11)
|
163 |
+
|
164 |
+
X = dataset_encoded.select(['Tumor_Size_mm', 'Genetic_Mutation'])
|
165 |
+
y = dataset_encoded.select(['Cancer_Stage']).to_series()
|
166 |
+
|
167 |
+
kmeans = KMeans(n_clusters=3, random_state=11)
|
168 |
+
spec = SpectralClustering(n_clusters=3, random_state=11)
|
169 |
+
|
170 |
+
labels_kmeans = kmeans.fit_predict(X)
|
171 |
+
labels_spec = spec.fit_predict(X)
|
172 |
+
|
173 |
+
# df_kmeans_parts = pl.DataFrame(labels_kmeans, schema=pl.String)
|
174 |
+
df_kmeans = X.with_columns(pl.lit(labels_kmeans, dtype=pl.String).alias('kmeans_cluster'))
|
175 |
+
|
176 |
+
return mo.vstack([
|
177 |
+
mo.md(f"""
|
178 |
+
## K-Means Clustering
|
179 |
+
|
180 |
+
### External Metrics
|
181 |
+
|
182 |
+
Adjusted Rand Index (ARI): {adjusted_rand_score(y, labels_kmeans)}
|
183 |
+
|
184 |
+
Normalized Mutual Information (NMI): {normalized_mutual_info_score(y, labels_kmeans)}
|
185 |
+
|
186 |
+
Homogeneity: {homogeneity_score(y, labels_kmeans)}
|
187 |
+
|
188 |
+
Completeness: {completeness_score(y, labels_kmeans)}
|
189 |
+
|
190 |
+
V-measure: {v_measure_score(y, labels_kmeans)}
|
191 |
+
|
192 |
+
### Internal Metrics
|
193 |
+
|
194 |
+
Silhouette Score: {silhouette_score(X, labels_kmeans)}
|
195 |
+
|
196 |
+
Davies-Bouldin Index: {davies_bouldin_score(X, labels_kmeans)}
|
197 |
+
|
198 |
+
Calinski-Harabasz Index: {calinski_harabasz_score(X, labels_kmeans)}
|
199 |
+
|
200 |
+
|
201 |
+
## Spectral Clustering
|
202 |
+
|
203 |
+
### External Metrics
|
204 |
+
|
205 |
+
Adjusted Rand Index (ARI): {adjusted_rand_score(y, labels_spec)}
|
206 |
+
|
207 |
+
Normalized Mutual Information (NMI): {normalized_mutual_info_score(y, labels_spec)}
|
208 |
+
|
209 |
+
Homogeneity: {homogeneity_score(y, labels_spec)}
|
210 |
+
|
211 |
+
Completeness: {completeness_score(y, labels_spec)}
|
212 |
+
|
213 |
+
V-measure: {v_measure_score(y, labels_spec)}
|
214 |
+
|
215 |
+
### Internal Metrics
|
216 |
+
|
217 |
+
Silhouette Score: {silhouette_score(X, labels_spec)}
|
218 |
+
|
219 |
+
Davies-Bouldin Index: {davies_bouldin_score(X, labels_spec)}
|
220 |
+
|
221 |
+
Calinski-Harabasz Index: {calinski_harabasz_score(X, labels_spec)}
|
222 |
+
|
223 |
+
{mo.callout("Unsupervised clustering techniques do perform reasonably well, but does not correlate to other labels.", 'info')}
|
224 |
+
"""),
|
225 |
+
|
226 |
+
alt.Chart(df_kmeans, autosize='pad').mark_circle().encode(
|
227 |
+
x='Genetic_Mutation',
|
228 |
+
y='Tumor_Size_mm',
|
229 |
+
color='kmeans_cluster'
|
230 |
+
)
|
231 |
+
])
|
232 |
+
|
233 |
+
|
234 |
+
_()
|
235 |
+
return
|
236 |
+
|
237 |
|
238 |
if __name__ == "__main__":
|
239 |
app.run()
|