File size: 1,300 Bytes
3cab2dd
932b3cb
 
3cab2dd
 
932b3cb
 
3cab2dd
 
 
 
 
 
 
fcc4124
3cab2dd
fcc4124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3cab2dd
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from dataclasses import dataclass
import numpy as np

from cluster.distance import euclidean
from cluster.clusterer import Clusterer


@dataclass
class Kmeans(Clusterer):
    k: int
    max_iter: int

    def build(
        self,
        X: np.array,
    ):
        # randomly initialize centroids
        centroids = X[np.random.choice(
            X.shape[0],
            self.k,
            replace=False,
        )]

        # Calculate Euclidean distance between each data point and each centroid
        # then assign each point to its closest cluster
        clusters = self.assign_clusters(X, centroids)
        centroids = self.update_centroids(self.k, X, clusters)

    @staticmethod
    def assign_clusters(
        X: np.array,
        centroids: np.array,
    ) -> np.array:
        distances = np.sqrt(((X - centroids[:, np.newaxis])**2).sum(axis=2))
        clusts = np.argmin(distances, axis=0)
        return clusts

    @staticmethod
    def update_centroids(
        k: int,
        X: np.array,
        clusters: np.array,
    ) -> np.array:
        centroids = np.zeros((k, X.shape[1]))
        for i in range(k):
            centroids[i] = X[clusters == i].mean(axis=0)
        return centroids

    def label():
        ...

    def main(self):
        return self.from_dict()