Jensen-holm committed on
Commit
3cab2dd
·
1 Parent(s): 40b389c

working on kmeans

Browse files
app.py CHANGED
@@ -37,17 +37,14 @@ def index():
37
  algorithm = options[request.json["algorithm"]]
38
  args = request.json["arguments"]
39
 
40
- # in the future instead of a random data set
41
- # we should do a more real one like palmer penguins
42
-
43
  X, y = iris()
44
- return jsonify(
45
- algorithm(
46
- X=X,
47
- y=y,
48
- args=args,
49
- )
50
  )
 
51
 
52
 
53
  if __name__ == "__main__":
 
37
  algorithm = options[request.json["algorithm"]]
38
  args = request.json["arguments"]
39
 
40
+ # using the iris data set for every algorithm
 
 
41
  X, y = iris()
42
+ result = algorithm(
43
+ X=X,
44
+ y=y,
45
+ args=args,
 
 
46
  )
47
+ return jsonify(result)
48
 
49
 
50
  if __name__ == "__main__":
cluster/clusterer.py CHANGED
@@ -1,5 +1,6 @@
1
  from dataclasses import dataclass
2
  from typing import Callable
 
3
  import numpy as np
4
 
5
 
@@ -8,17 +9,15 @@ class Clusterer:
8
  cluster_func: Callable
9
  options: dict
10
 
11
- accuracy: float = 0
12
-
13
- @staticmethod
14
- def label():
15
- return
16
-
17
- def eval(y_pred, y_true) -> None:
18
- return
19
 
20
  @classmethod
21
- def from_dict(cls, dct):
22
  return cls(**dct)
23
 
24
  def to_dict(self):
 
1
  from dataclasses import dataclass
2
  from typing import Callable
3
+
4
  import numpy as np
5
 
6
 
 
9
  cluster_func: Callable
10
  options: dict
11
 
12
+ def eval(
13
+ self,
14
+ pred_labels: np.array,
15
+ true_labels: np.array,
16
+ ) -> None:
17
+ ...
 
 
18
 
19
  @classmethod
20
+ def from_dict(cls, dct: dict):
21
  return cls(**dct)
22
 
23
  def to_dict(self):
cluster/distance.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+
4
def euclidean(
    point: np.array,
    data: np.array,
) -> np.array:
    """
    Compute the euclidean distance between a point and every row of a dataset.

    point dims: (m,)
    data dims: (n, m)
    output dims: (n,)

    A single (m,) vector passed as `data` yields a scalar distance.
    """
    diffs = np.asarray(point) - np.asarray(data)
    # Reduce over the feature axis *before* taking the root; `axis` belongs to
    # np.sum, not np.sqrt, otherwise every row collapses into one number.
    return np.sqrt(np.sum(diffs ** 2, axis=-1))
cluster/kmeans.py CHANGED
@@ -1,13 +1,43 @@
 
1
  import numpy as np
2
 
 
 
3
 
4
- def kmeans(
5
- X_train: np.array,
6
- y_train: np.array,
7
- args: dict,
8
- ):
9
- # for this alg, the only argument
10
- # is the number of clusters, k
11
- # and max iterations
12
 
13
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
  import numpy as np
3
 
4
+ from cluster.distance import euclidean
5
+ from cluster.clusterer import Clusterer
6
 
 
 
 
 
 
 
 
 
7
 
8
@dataclass
class Kmeans(Clusterer):
    """
    K-means clusterer.

    k: number of clusters (centroids) to fit
    max_iter: upper bound on the number of assign/update rounds
    """
    k: int
    max_iter: int

    def build(
        self,
        X_train: np.array,
    ):
        """
        Fit k centroids to X_train with Lloyd's algorithm.

        X_train dims: (n, m)
        output dims: (k, m) -- the final centroid positions

        Stops as soon as an update leaves every centroid unchanged, or after
        `max_iter` rounds, whichever comes first.
        """
        points = np.asarray(X_train, dtype=float)

        # Randomly select centroid start points, uniformly distributed
        # across the domain of the dataset
        minimum = np.min(points, axis=0)
        maximum = np.max(points, axis=0)
        centroids = np.random.uniform(
            minimum,
            maximum,
            size=(self.k, points.shape[1]),
        )

        for _ in range(self.max_iter):
            # (n, k) matrix of distances from every point to every centroid
            dists = np.linalg.norm(
                points[:, None, :] - centroids[None, :, :],
                axis=2,
            )
            nearest = np.argmin(dists, axis=1)

            updated = centroids.copy()
            for idx in range(self.k):
                members = points[nearest == idx]
                # an empty cluster keeps its previous centroid instead of
                # becoming NaN from the mean of zero rows
                if len(members):
                    updated[idx] = members.mean(axis=0)

            if np.array_equal(updated, centroids):
                break
            centroids = updated

        return centroids

    def label():
        # NOTE(review): declared without `self`/`cls` and without a decorator,
        # so it is only callable as `Kmeans.label()` -- confirm whether this
        # should be a @staticmethod or an instance method before implementing.
        ...

    def main(self):
        # NOTE(review): calls `from_dict` with no arguments; if the inherited
        # classmethod requires a dict this raises TypeError -- confirm the
        # intended arguments against the caller.
        return self.from_dict()
cluster/kmedoids.py CHANGED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ import numpy as np
3
+
4
+ from cluster.clusterer import Clusterer
5
+
6
+
7
@dataclass
class Kmedoids(Clusterer):
    """K-medoids clusterer skeleton; none of its methods is implemented yet."""

    k: int

    def build(self, X_train: np.array):
        """Placeholder: does nothing and returns None."""

    def label():
        """Placeholder: does nothing and returns None."""

    def main():
        """Placeholder: does nothing and returns None."""
cluster/main.py CHANGED
@@ -1,7 +1,7 @@
1
  from sklearn.model_selection import train_test_split
2
- from typing import Callable
3
  import numpy as np
4
 
 
5
  # for determining which clustering function to call
6
  from cluster.opts import clustering_methods
7
 
@@ -10,14 +10,8 @@ def main(
10
  X: np.array,
11
  y: np.array,
12
  args: dict,
13
- ):
14
-
15
- cluster_alg: Callable = clustering_methods[args["algorithm"]]
16
- X_train, X_test, y_train, y_test = train_test_split(
17
- X,
18
- y,
19
- test_size=0.2,
20
- random_state=8675309,
21
- )
22
-
23
- return
 
1
  from sklearn.model_selection import train_test_split
 
2
  import numpy as np
3
 
4
+ from cluster.clusterer import Clusterer
5
  # for determining which clustering function to call
6
  from cluster.opts import clustering_methods
7
 
 
10
  X: np.array,
11
  y: np.array,
12
  args: dict,
13
+ ) -> dict:
14
+ cluster_alg: Clusterer = clustering_methods[args["algorithm"]]
15
+ model = cluster_alg.main(X, args)
16
+ model.eval(X, y)
17
+ return model.to_dict()
 
 
 
 
 
 
cluster/opts.py CHANGED
@@ -1,3 +1,9 @@
1
- clustering_methods = {
2
- "kmeans": "KMeans",
 
 
 
 
 
 
3
  }
 
1
+ from cluster.clusterer import Clusterer
2
+ from cluster.kmedoids import Kmedoids
3
+ from cluster.kmeans import Kmeans
4
+
5
+
6
# Maps an algorithm name to the clusterer class that implements it.
# The values are the classes themselves, not instances.
clustering_methods: dict[str, type[Clusterer]] = {
    "kmeans": Kmeans,
    "kmedoids": Kmedoids,
}
example/kmeans.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+
4
ENDPOINT: str = "https://data-mining-from-scratch-backend.onrender.com/"

# JSON payload: which algorithm to run and the arguments to pass to it
request_params = {
    "algorithm": "kmeans",
    "arguments": {
        "k": 3,
        "max_iter": 10,
    },
}

headers = {
    "Content-Type": "application/json",
}


def main() -> None:
    """POST the example k-means request to ENDPOINT and print the JSON reply."""
    r = requests.post(
        ENDPOINT,
        headers=headers,
        data=json.dumps(request_params),
    )
    print(r.json())


# The request is sent only when the file is run as a script, so importing
# this module no longer triggers a network call.
if __name__ == "__main__":
    main()
neural_network/neural_network.py CHANGED
@@ -25,7 +25,6 @@ class NeuralNetwork:
25
  return self.compute_node(n1, self.w2, self.b2, self.activation_func)
26
 
27
  def set_loss_hist(self, loss_hist: list) -> None:
28
- assert (isinstance(loss_hist, list))
29
  self.loss_history = loss_hist
30
 
31
  def eval(self, X_test, y_test) -> None:
 
25
  return self.compute_node(n1, self.w2, self.b2, self.activation_func)
26
 
27
  def set_loss_hist(self, loss_hist: list) -> None:
 
28
  self.loss_history = loss_hist
29
 
30
  def eval(self, X_test, y_test) -> None: