computerscience-person committed on
Commit
b8863d9
·
1 Parent(s): 689e49e

Add unsupervised clustering models.

Browse files
Files changed (2) hide show
  1. .dockerignore +1 -0
  2. app.py +115 -24
.dockerignore CHANGED
@@ -2,3 +2,4 @@
2
  ./dataset/diabetes_012_health_indicators_BRFSS2015.csv
3
  ./dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv
4
  ./dataset/final_phone_preferences_india.csv
 
 
2
  ./dataset/diabetes_012_health_indicators_BRFSS2015.csv
3
  ./dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv
4
  ./dataset/final_phone_preferences_india.csv
5
+ ./__pycache__/
app.py CHANGED
@@ -4,6 +4,12 @@ __generated_with = "0.11.17"
4
  app = marimo.App(width="medium")
5
 
6
 
 
 
 
 
 
 
7
  @app.cell
8
  def _():
9
  import marimo as mo
@@ -14,54 +20,45 @@ def _():
14
  @app.cell
15
  def _(pl):
16
  dataset = pl.read_csv('./dataset/colorectal_cancer_dataset.csv')
17
- dataset
18
  return (dataset,)
19
 
20
 
21
- @app.cell
22
  def _(dataset, pl):
23
  from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
24
 
25
- encoder = OneHotEncoder(sparse_output=False)
26
  ord_encoder = OrdinalEncoder()
27
- encoded = encoder.fit_transform(dataset.select(['Obesity_BMI', 'Cancer_Stage']))
28
- ord_encoded = ord_encoder.fit_transform(dataset.select('Survival_5_years'))
29
- encoded_features = encoder.get_feature_names_out(['Obesity_BMI', 'Cancer_Stage'])
30
- ord_encoded_features = ord_encoder.get_feature_names_out(['Survival_5_years'])
31
  encoded_schema = {name: pl.Int8 for name in encoded_features}
32
- ord_encoded_schema = {name: pl.Int8 for name in ord_encoded_features}
33
- dataset_encoded_parts = pl.DataFrame(encoded, schema=encoded_schema)
34
- dataset_ord_encoded_parts = pl.DataFrame(ord_encoded, schema=ord_encoded_schema)
35
- dataset_encoded = dataset.with_columns(dataset_encoded_parts).with_columns(dataset_ord_encoded_parts)
36
  return (
37
  OneHotEncoder,
38
  OrdinalEncoder,
39
  dataset_encoded,
40
  dataset_encoded_parts,
41
- dataset_ord_encoded_parts,
42
- encoded,
43
  encoded_features,
44
  encoded_schema,
45
- encoder,
46
  ord_encoded,
47
- ord_encoded_features,
48
- ord_encoded_schema,
49
  ord_encoder,
50
  )
51
 
52
 
53
  @app.cell
54
- def _(dataset_encoded, encoded_features, mo):
55
  from sklearn.linear_model import LogisticRegression
56
  from sklearn.naive_bayes import BernoulliNB
57
  from sklearn.tree import DecisionTreeClassifier
58
- from sklearn.svm import SVC
59
  from sklearn.model_selection import train_test_split
60
  from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix
61
 
62
- X = dataset_encoded.select(['Age', 'Tumor_Size_mm'] + encoded_features.tolist())
63
  y = dataset_encoded.select(['Survival_5_years'])
64
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)
65
  logreg = LogisticRegression()
66
  y_pred_logreg = logreg.fit(X_train, y_train).predict(X_test)
67
  bnb = BernoulliNB()
@@ -69,8 +66,9 @@ def _(dataset_encoded, encoded_features, mo):
69
  dectree = DecisionTreeClassifier()
70
  y_pred_dectree = dectree.fit(X_train, y_train).predict(X_test)
71
 
 
72
  mo.md(f"""
73
- # Logistic Regression
74
 
75
  Accuracy score: {accuracy_score(y_test, y_pred_logreg)}
76
 
@@ -86,7 +84,7 @@ def _(dataset_encoded, encoded_features, mo):
86
  {classification_report(y_test, y_pred_logreg)}
87
  ```
88
 
89
- # Bernoulli Naive Bayes
90
 
91
  Accuracy score: {accuracy_score(y_test, y_pred_bnb)}
92
 
@@ -102,7 +100,7 @@ def _(dataset_encoded, encoded_features, mo):
102
  {classification_report(y_test, y_pred_bnb)}
103
  ```
104
 
105
- # Decision Tree Classifier
106
 
107
  Accuracy score: {accuracy_score(y_test, y_pred_dectree)}
108
 
@@ -117,12 +115,15 @@ def _(dataset_encoded, encoded_features, mo):
117
  ```
118
  {classification_report(y_test, y_pred_dectree)}
119
  ```
 
 
 
 
120
  """)
121
  return (
122
  BernoulliNB,
123
  DecisionTreeClassifier,
124
  LogisticRegression,
125
- SVC,
126
  X,
127
  X_test,
128
  X_train,
@@ -143,6 +144,96 @@ def _(dataset_encoded, encoded_features, mo):
143
  )
144
 
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  if __name__ == "__main__":
148
  app.run()
 
4
  app = marimo.App(width="medium")
5
 
6
 
7
+ @app.cell
8
+ def _(mo):
9
+ mo.md(r"""# Analyzing Colorectal Cancer Dataset""")
10
+ return
11
+
12
+
13
  @app.cell
14
  def _():
15
  import marimo as mo
 
20
  @app.cell
21
  def _(pl):
22
  dataset = pl.read_csv('./dataset/colorectal_cancer_dataset.csv')
23
+ # dataset.select("Tumor_Size_mm").describe()
24
  return (dataset,)
25
 
26
 
27
+ @app.cell(hide_code=True)
28
  def _(dataset, pl):
29
  from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
30
 
 
31
  ord_encoder = OrdinalEncoder()
32
+ ord_encoded = ord_encoder.fit_transform(dataset.select('Early_Detection', 'Cancer_Stage', 'Survival_5_years'))
33
+ encoded_features = ord_encoder.get_feature_names_out(['Early_Detection', 'Cancer_Stage', 'Survival_5_years'])
 
 
34
  encoded_schema = {name: pl.Int8 for name in encoded_features}
35
+ # print(encoded_schema)
36
+ dataset_encoded_parts = pl.DataFrame(ord_encoded, encoded_schema)
37
+ dataset_encoded = dataset.with_columns(dataset_encoded_parts)
38
+ # dataset_encoded
39
  return (
40
  OneHotEncoder,
41
  OrdinalEncoder,
42
  dataset_encoded,
43
  dataset_encoded_parts,
 
 
44
  encoded_features,
45
  encoded_schema,
 
46
  ord_encoded,
 
 
47
  ord_encoder,
48
  )
49
 
50
 
51
  @app.cell
52
+ def _(dataset_encoded, mo):
53
  from sklearn.linear_model import LogisticRegression
54
  from sklearn.naive_bayes import BernoulliNB
55
  from sklearn.tree import DecisionTreeClassifier
 
56
  from sklearn.model_selection import train_test_split
57
  from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix
58
 
59
+ X = dataset_encoded.select(['Tumor_Size_mm', 'Early_Detection', 'Cancer_Stage'])
60
  y = dataset_encoded.select(['Survival_5_years'])
61
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
62
  logreg = LogisticRegression()
63
  y_pred_logreg = logreg.fit(X_train, y_train).predict(X_test)
64
  bnb = BernoulliNB()
 
66
  dectree = DecisionTreeClassifier()
67
  y_pred_dectree = dectree.fit(X_train, y_train).predict(X_test)
68
 
69
+
70
  mo.md(f"""
71
+ ## Logistic Regression
72
 
73
  Accuracy score: {accuracy_score(y_test, y_pred_logreg)}
74
 
 
84
  {classification_report(y_test, y_pred_logreg)}
85
  ```
86
 
87
+ ## Bernoulli Naive Bayes
88
 
89
  Accuracy score: {accuracy_score(y_test, y_pred_bnb)}
90
 
 
100
  {classification_report(y_test, y_pred_bnb)}
101
  ```
102
 
103
+ ## Decision Tree Classifier
104
 
105
  Accuracy score: {accuracy_score(y_test, y_pred_dectree)}
106
 
 
115
  ```
116
  {classification_report(y_test, y_pred_dectree)}
117
  ```
118
+
119
+ ## Conclusion
120
+
121
+ {mo.callout("Classifiers don't work well with this dataset, let's try something else.", kind='info')}
122
  """)
123
  return (
124
  BernoulliNB,
125
  DecisionTreeClassifier,
126
  LogisticRegression,
 
127
  X,
128
  X_test,
129
  X_train,
 
144
  )
145
 
146
 
147
+ @app.cell
148
+ def _(OrdinalEncoder, dataset, mo, pl):
149
+ def _():
150
+ from sklearn.cluster import KMeans, SpectralClustering, DBSCAN
151
+ from sklearn.svm import SVC
152
+ from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, homogeneity_score, completeness_score, v_measure_score, silhouette_score, davies_bouldin_score, calinski_harabasz_score
153
+ import altair as alt
154
+
155
+ genmut_encoder = OrdinalEncoder()
156
+ genmut_encoded = genmut_encoder.fit_transform(dataset.select('Genetic_Mutation'))
157
+ genmut_features = genmut_encoder.get_feature_names_out(['Genetic_Mutation'])
158
+ encoded_schema = {name: pl.Int8 for name in genmut_features}
159
+ dataset_encoded_parts = pl.DataFrame(genmut_encoded, encoded_schema)
160
+ dataset_encoded = dataset.with_columns(dataset_encoded_parts)
161
+ # Use samples since dataset is way too big to run locally
162
+ dataset_encoded = dataset_encoded.sample(3000, seed=11)
163
+
164
+ X = dataset_encoded.select(['Tumor_Size_mm', 'Genetic_Mutation'])
165
+ y = dataset_encoded.select(['Cancer_Stage']).to_series()
166
+
167
+ kmeans = KMeans(n_clusters=3, random_state=11)
168
+ spec = SpectralClustering(n_clusters=3, random_state=11)
169
+
170
+ labels_kmeans = kmeans.fit_predict(X)
171
+ labels_spec = spec.fit_predict(X)
172
+
173
+ # df_kmeans_parts = pl.DataFrame(labels_kmeans, schema=pl.String)
174
+ df_kmeans = X.with_columns(pl.lit(labels_kmeans, dtype=pl.String).alias('kmeans_cluster'))
175
+
176
+ return mo.vstack([
177
+ mo.md(f"""
178
+ ## K-Means Clustering
179
+
180
+ ### External Metrics
181
+
182
+ Adjusted Rand Index (ARI): {adjusted_rand_score(y, labels_kmeans)}
183
+
184
+ Normalized Mutual Information (NMI): {normalized_mutual_info_score(y, labels_kmeans)}
185
+
186
+ Homogeneity: {homogeneity_score(y, labels_kmeans)}
187
+
188
+ Completeness: {completeness_score(y, labels_kmeans)}
189
+
190
+ V-measure: {v_measure_score(y, labels_kmeans)}
191
+
192
+ ### Internal Metrics
193
+
194
+ Silhouette Score: {silhouette_score(X, labels_kmeans)}
195
+
196
+ Davies-Bouldin Index: {davies_bouldin_score(X, labels_kmeans)}
197
+
198
+ Calinski-Harabasz Index: {calinski_harabasz_score(X, labels_kmeans)}
199
+
200
+
201
+ ## Spectral Clustering
202
+
203
+ ### External Metrics
204
+
205
+ Adjusted Rand Index (ARI): {adjusted_rand_score(y, labels_spec)}
206
+
207
+ Normalized Mutual Information (NMI): {normalized_mutual_info_score(y, labels_spec)}
208
+
209
+ Homogeneity: {homogeneity_score(y, labels_spec)}
210
+
211
+ Completeness: {completeness_score(y, labels_spec)}
212
+
213
+ V-measure: {v_measure_score(y, labels_spec)}
214
+
215
+ ### Internal Metrics
216
+
217
+ Silhouette Score: {silhouette_score(X, labels_spec)}
218
+
219
+ Davies-Bouldin Index: {davies_bouldin_score(X, labels_spec)}
220
+
221
+ Calinski-Harabasz Index: {calinski_harabasz_score(X, labels_spec)}
222
+
223
+ {mo.callout("Unsupervised clustering techniques do perform reasonably well, but does not correlate to other labels.", 'info')}
224
+ """),
225
+
226
+ alt.Chart(df_kmeans, autosize='pad').mark_circle().encode(
227
+ x='Genetic_Mutation',
228
+ y='Tumor_Size_mm',
229
+ color='kmeans_cluster'
230
+ )
231
+ ])
232
+
233
+
234
+ _()
235
+ return
236
+
237
 
238
  if __name__ == "__main__":
239
  app.run()