-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster.py
121 lines (97 loc) · 3.09 KB
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from sklearn.cluster import KMeans, MiniBatchKMeans
import numpy as np
import math
from numpy import linalg as LA
def dataWithLabel(data, label, labels):
    """Return the rows of ``data`` whose entry in ``labels`` equals ``label``.

    ``data`` is indexed with a list of positions, so it must support
    list-based (fancy) indexing. ``labels[i]`` is read for every position
    ``i`` in ``data``.
    """
    # Only positions that exist in ``data`` are inspected.
    matching = [pos for pos in range(len(data)) if labels[pos] == label]
    return data[matching]
def initialCentroids(clusterCenters, data, labels):
    """Build a ``{index: Cluster}`` map from a set of cluster centers.

    Each center at position ``i`` in ``clusterCenters`` becomes a
    ``Cluster`` labelled ``i`` that holds the rows of ``data`` whose entry
    in ``labels`` equals ``i``.
    """
    return {
        idx: Cluster(idx, center, dataWithLabel(data, idx, labels))
        for idx, center in enumerate(clusterCenters)
    }
class Cluster:
    """A labelled group of points together with its centroid."""

    def __init__(self, label, centroid, points):
        """Store the cluster's label, centroid and member points as given."""
        self.label, self.centroid, self.points = label, centroid, points

    def isViolation(self, theta):
        """Return True if any point lies too far from the centroid.

        A point is too far when its distance to the centroid exceeds
        ``theta`` times the norm of the centroid. An empty cluster never
        violates.
        """
        # The limit depends only on the centroid, so compute it once.
        limit = theta * LA.norm(self.centroid)
        return any(
            LA.norm(member - self.centroid) > limit for member in self.points
        )
class ClusterSet:
    """A data set plus the clusters most recently computed for it.

    ``data`` holds the rows to cluster. ``clusterMap`` maps a cluster label
    to its ``Cluster`` and is ``None`` until ``fitData`` has been called.
    """

    def __init__(self, data):
        """Keep a reference to ``data``; no clustering is performed yet."""
        self.data = data
        self.clusterMap = None

    def normalize(self):
        """Scale each row of ``data`` in place so that its entries sum to 1.

        Rows whose sum is zero are left unchanged: dividing them would fill
        the row with NaN/inf values.
        """
        for row in self.data:
            total = sum(row)
            if total != 0:
                row /= total

    def voltage_normalize(self):
        """Subtract 1 from every entry of ``data`` in place."""
        for row in self.data:
            for j in range(len(row)):
                row[j] -= 1

    def maxLabel(self):
        """Return the largest cluster label as an int, or None if there are none."""
        return max(map(int, self.clusterMap), default=None)

    def fitData(self, K):
        """Run K-means with ``K`` clusters on ``data`` and rebuild ``clusterMap``.

        The first fit is seeded with k-means++. Later fits are seeded with
        the centroids currently stored in ``clusterMap``.

        NOTE: scikit-learn expects an explicit ``init`` array to have one row
        per cluster, so ``clusterMap`` should hold exactly ``K`` clusters
        when it is reused as the seed.
        """
        if not self.clusterMap:
            seed = 'k-means++'
        else:
            # use centroids from last iteration
            seed = self.getCentroids()
        km = KMeans(n_clusters=K, init=seed, max_iter=100, n_init=1)
        km.fit(self.data)
        self.clusterMap = initialCentroids(km.cluster_centers_, self.data, km.labels_)

    def getCentroids(self):
        """Return the centroids of all clusters as one array, in ``clusterMap`` order."""
        return np.array([cluster.centroid for cluster in self.clusterMap.values()])

    def getCluster(self, label):
        """Return the cluster stored under ``label`` (KeyError if absent)."""
        return self.clusterMap[label]

    def findViolations(self, theta):
        """Return the labels of all clusters for which ``isViolation(theta)`` holds."""
        return [
            cluster.label
            for cluster in self.clusterMap.values()
            if cluster.isViolation(theta)
        ]

    def splitLabel(self, label):
        """Replace the cluster ``label`` by two clusters from a 2-means split.

        The two new clusters get the labels ``maxLabel() + 1`` and
        ``maxLabel() + 2``; the original entry is removed from ``clusterMap``.
        """
        km = KMeans(n_clusters=2, init='k-means++', max_iter=100, n_init=1)
        cluster = self.getCluster(label)
        km.fit(cluster.points)
        # Compute the offset once, before any new label is inserted.
        labelOffset = self.maxLabel() + 1
        for half in (0, 1):
            newLabel = labelOffset + half
            members = dataWithLabel(cluster.points, half, km.labels_)
            self.clusterMap[newLabel] = Cluster(
                newLabel, km.cluster_centers_[half], members
            )
        del self.clusterMap[label]

    def _pointCount(self, label):
        """Return the number of points in the cluster stored under ``label``."""
        return len(self.clusterMap[label].points)

    def smallestCluster(self):
        """Return the label of the cluster with the fewest points.

        Ties go to the first such label in ``clusterMap`` order; returns
        None when there are no clusters.
        """
        return min(self.clusterMap, key=self._pointCount, default=None)

    def largestCluster(self):
        """Return the label of the cluster with the most points.

        Ties go to the first such label in ``clusterMap`` order; returns
        None when there are no clusters.
        """
        return max(self.clusterMap, key=self._pointCount, default=None)