@@ -16,31 +16,24 @@ const (
1616type kmeansClusterer struct {
1717 iterations int
1818 number int
19- dimension int
2019
2120 // Variables keeping count of changes of points' membership every iteration. User as a stopping condition.
2221 changes , oldchanges , counter , threshold int
2322
2423 // For online learning only
25- alpha float64
24+ alpha float64
25+ dimension int
2626
2727 distance DistanceFunc
2828
29- // Mapping from training set points to clusters' numbers.
30- a map [int ]int
29+ // a holds the mapping of data point indices to cluster numbers, b holds the sizes of each cluster
30+ mu sync.RWMutex
31+ a , b []int
3132
32- // Mapping from clusters' numbers to set of points they contain.
33- b [][]int
33+ // variables holding values of centroids of each clusters
34+ m , n [][]float64
3435
35- // Mapping from clusters' numbers to their means
36- m [][]float64
37-
38- // Training set
3936 d [][]float64
40-
41- // Computed clusters. Access is synchronized.
42- mu sync.RWMutex
43- c []HardCluster
4437}
4538
4639func KmeansClusterer (iterations , clusters int , distance DistanceFunc ) (HardClusterer , error ) {
@@ -88,8 +81,12 @@ func (c *kmeansClusterer) Learn(data [][]float64) error {
8881
8982 c .d = data
9083
91- c .a = make (map [int ]int , len (data ))
92- c .b = make ([][]int , c .number )
84+ c .a = make ([]int , len (data ))
85+ c .b = make ([]int , c .number )
86+
87+ for i := 0 ; i < len (data ); i ++ {
88+ c .a [i ] = - 1
89+ }
9390
9491 c .counter = 0
9592 c .threshold = CHANGES_THRESHOLD
@@ -98,69 +95,60 @@ func (c *kmeansClusterer) Learn(data [][]float64) error {
9895
9996 c .initializeMeansWithData ()
10097
101- for i := 0 ; i < c .iterations && c .notConverged () ; i ++ {
98+ for i := 0 ; i < c .iterations && c .counter != c . threshold ; i ++ {
10299 c .run ()
100+ c .check ()
103101 }
104102
105- var wg sync.WaitGroup
106- {
107- wg .Add (c .number )
108- }
109-
110- for j := 0 ; j < c .number ; j ++ {
111- go func (n int ) {
112- defer wg .Done ()
113-
114- c .c [n ] = make ([][]float64 , len (c .b [n ]))
115-
116- for k := 0 ; k < len (c .b [n ]); k ++ {
117- c.c [n ][k ] = c .d [c.b [n ][k ]]
118- }
119- }(j )
120- }
121-
122- wg .Wait ()
123-
124103 c .mu .Unlock ()
125104
126- c .a = nil
127- c .b = nil
105+ c .n = nil
128106
129107 return nil
130108}
131109
132- func (c * kmeansClusterer ) Guesses () []HardCluster {
110+ func (c * kmeansClusterer ) Sizes () []int {
111+ c .mu .RLock ()
112+ defer c .mu .RUnlock ()
113+
114+ return c .b
115+ }
116+
117+ func (c * kmeansClusterer ) Guesses () []int {
133118 c .mu .RLock ()
134119 defer c .mu .RUnlock ()
135120
136- return c .c
121+ return c .a
137122}
138123
139- func (c * kmeansClusterer ) Predict (p []float64 ) HardCluster {
124+ func (c * kmeansClusterer ) Predict (p []float64 ) int {
140125 var (
141- l HardCluster
126+ l int
142127 d float64
143- m float64 = math . MaxFloat64
128+ m float64 = c . distance ( p , c . m [ 0 ])
144129 )
145130
146- for i := 0 ; i < len ( c . c ) ; i ++ {
131+ for i := 1 ; i < c . number ; i ++ {
147132 if d = c .distance (p , c .m [i ]); d < m {
148133 m = d
149- l = c . c [ i ]
134+ l = i
150135 }
151136 }
152137
153138 return l
154139}
155140
156- func (c * kmeansClusterer ) Online (observations chan []float64 , done chan struct {}, callback func ([] float64 , int )) {
141+ func (c * kmeansClusterer ) Online (observations chan []float64 , done chan struct {}) chan * HCEvent {
157142 c .mu .Lock ()
158143
159144 var (
160- l , f int = len (c .m ), len (c .m [0 ])
161- h float64 = 1 - c .alpha
145+ r chan * HCEvent = make (chan * HCEvent )
146+ l , f int = len (c .m ), len (c .m [0 ])
147+ h float64 = 1 - c .alpha
162148 )
163149
150+ c .b = make ([]int , c .number )
151+
164152 /* The first step of online learning is adjusting the centroids by finding the one closest to the new data point
165153 * and modifying its location using the given alpha. Once the client stops sending new data, the actual clusters
166154 * are computed and the mutex is unlocked. */
@@ -182,7 +170,10 @@ func (c *kmeansClusterer) Online(observations chan []float64, done chan struct{}
182170 }
183171 }
184172
185- go callback (o , k )
173+ r <- & HCEvent {
174+ Cluster : k ,
175+ Observation : o ,
176+ }
186177
187178 for i := 0 ; i < f ; i ++ {
188179 c.m [k ][i ] = c .alpha * o [i ] + h * c.m [k ][i ]
@@ -193,13 +184,10 @@ func (c *kmeansClusterer) Online(observations chan []float64, done chan struct{}
193184 go func () {
194185 var (
195186 n int
196- l int = len (c .d ) / c .number
197187 d , m float64
198188 )
199189
200- for i := 0 ; i < c .number ; i ++ {
201- c .c [n ] = make ([][]float64 , 0 , l )
202- }
190+ c .a = make ([]int , len (c .d ))
203191
204192 for i := 0 ; i < len (c .d ); i ++ {
205193 m = c .distance (c .d [i ], c .m [0 ])
@@ -212,7 +200,8 @@ func (c *kmeansClusterer) Online(observations chan []float64, done chan struct{}
212200 }
213201 }
214202
215- c .c [n ] = append (c .c [n ], c .d [i ])
203+ c .a [i ] = n
204+ c .b [n ]++
216205 }
217206
218207 c .mu .Unlock ()
@@ -222,12 +211,14 @@ func (c *kmeansClusterer) Online(observations chan []float64, done chan struct{}
222211 }
223212 }
224213 }()
214+
215+ return r
225216}
226217
227218// private
228219func (c * kmeansClusterer ) initializeMeansWithData () {
229220 c .m = make ([][]float64 , c .number )
230- c .c = make ([]HardCluster , c .number )
221+ c .n = make ([][] float64 , c .number )
231222
232223 rand .Seed (time .Now ().UTC ().Unix ())
233224
@@ -264,11 +255,13 @@ func (c *kmeansClusterer) initializeMeansWithData() {
264255 c .m [i ] = c .d [k ]
265256 }
266257
258+ for i := 0 ; i < c .number ; i ++ {
259+ c .n [i ] = make ([]float64 , len (c .m [0 ]))
260+ }
267261}
268262
269263func (c * kmeansClusterer ) initializeMeans () {
270264 c .m = make ([][]float64 , c .number )
271- c .c = make ([]HardCluster , c .number )
272265
273266 rand .Seed (time .Now ().UTC ().Unix ())
274267
@@ -282,56 +275,49 @@ func (c *kmeansClusterer) initializeMeans() {
282275
283276func (c * kmeansClusterer ) run () {
284277 var (
285- l , n int
278+ n , l int = 0 , len ( c . m [ 0 ])
286279 d , m float64
287280 )
288281
289- for i := 0 ; i < len (c .c ); i ++ {
290- if l = len (c .b [i ]); l == 0 {
291- continue
292- }
293-
294- c .m [i ] = make ([]float64 , len (c .d [0 ]))
295-
296- for j := 0 ; j < l ; j ++ {
297- floats .Add (c .m [i ], c .d [c.b [i ][j ]])
298- }
299-
300- floats .Scale (1 / float64 (l ), c .m [i ])
301-
302- c .b [i ] = c .b [i ][:0 ]
282+ for i := 0 ; i < c .number ; i ++ {
283+ c .b [i ] = 0
303284 }
304285
305286 for i := 0 ; i < len (c .d ); i ++ {
306287 m = c .distance (c .d [i ], c .m [0 ])
307288 n = 0
308289
309- for j := 1 ; j < len ( c . c ) ; j ++ {
290+ for j := 1 ; j < c . number ; j ++ {
310291 if d = c .distance (c .d [i ], c .m [j ]); d < m {
311292 m = d
312293 n = j
313294 }
314295 }
315296
316- if v , ok := c .a [i ]; ok && v != n {
297+ if c .a [i ] != n {
317298 c .changes ++
318299 }
319300
320301 c .a [i ] = n
321- c .b [n ] = append (c .b [n ], i )
302+ c .b [n ]++
303+
304+ floats .Add (c .n [n ], c .d [i ])
322305 }
323- }
324306
325- func (c * kmeansClusterer ) notConverged () bool {
326- if c .counter == c .threshold {
327- return false
307+ for i := 0 ; i < c .number ; i ++ {
308+ floats .Scale (1 / float64 (c .b [i ]), c .n [i ])
309+
310+ for j := 0 ; j < l ; j ++ {
311+ c.m [i ][j ] = c.n [i ][j ]
312+ c.n [i ][j ] = 0
313+ }
328314 }
315+ }
329316
317+ func (c * kmeansClusterer ) check () {
330318 if c .changes == c .oldchanges {
331319 c .counter ++
332320 }
333321
334322 c .oldchanges = c .changes
335-
336- return true
337323}
0 commit comments