-
Notifications
You must be signed in to change notification settings - Fork 10
/
6. step4.py
82 lines (55 loc) · 2.09 KB
/
6. step4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import codecademylib3_seaborn
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from copy import deepcopy
# Load the Iris measurements and pull out the first two feature columns.
iris = datasets.load_iris()
samples = iris.data
x = samples[:, 0]
y = samples[:, 1]

# Pair the two columns row by row into a 2-column array of points.
sepal_length_width = np.column_stack((x, y))

# Step 1: Place K random centroids
# Each coordinate is drawn uniformly from the observed range of that feature.
k = 3
centroids_x = np.random.uniform(x.min(), x.max(), size=k)
centroids_y = np.random.uniform(y.min(), y.max(), size=k)
centroids = np.column_stack((centroids_x, centroids_y))
def distance(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    Both arguments are iterables of numeric coordinates (tuples, lists or
    1-D NumPy arrays). Coordinates are paired positionally, so points of
    any dimensionality are supported; for 2-D points the result is the
    same as ``((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5``.

    If the two points have different lengths, the extra coordinates of the
    longer one are ignored (``zip`` stops at the shorter input).
    """
    squared_sum = sum((p - q) ** 2 for p, q in zip(a, b))
    return squared_sum ** 0.5
# To store the value of centroids when it updates
centroids_old = np.zeros(centroids.shape)

# Cluster labels: one entry per sample, each in 0 .. k-1
labels = np.zeros(len(samples))

# Scratch buffer: distance from the current sample to each of the k centroids
distances = np.zeros(k)

# Initialize error: how far each centroid moved during the last update.
# centroids_old starts as all zeros, so this first measurement is each
# initial centroid's distance from the origin.
error = np.zeros(k)
for i in range(k):
    error[i] = distance(centroids[i], centroids_old[i])

# Repeat Steps 2 and 3 until convergence.
# Keep iterating while ANY centroid is still moving; stopping as soon as a
# single centroid is stationary would end the loop before the others settle.
while error.any():
    # Step 2: Assign samples to nearest centroid
    for i, sample in enumerate(sepal_length_width):
        for c in range(k):
            distances[c] = distance(sample, centroids[c])
        labels[i] = np.argmin(distances)

    # Step 3: Update centroids
    centroids_old = centroids.copy()
    for i in range(k):
        points = sepal_length_width[labels == i]
        # A centroid that attracted no samples keeps its previous position;
        # averaging an empty selection would turn it into NaN, and a NaN
        # error never reaches zero.
        if len(points) > 0:
            centroids[i] = points.mean(axis=0)

    for i in range(k):
        error[i] = distance(centroids[i], centroids_old[i])
# Plot every cluster in its own color, then overlay the final centroids.
colors = ['r', 'g', 'b']
for i in range(k):
    # Boolean-mask selection keeps a (0, 2) shape when a cluster is empty,
    # so the column slices below stay valid instead of raising IndexError.
    points = sepal_length_width[labels == i]
    # Cycle through the palette in case k exceeds the number of colors.
    color = colors[i % len(colors)]
    plt.scatter(points[:, 0], points[:, 1], c=color, alpha=0.5)

# Centroids are drawn as large diamonds on top of the sample points.
plt.scatter(centroids[:, 0], centroids[:, 1], marker='D', s=150)
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.show()