Skip to content

Commit ce9bb69

Browse files
committed
Created the Estimator entity - an object that finds the optimal number of clusters. The solution is based on the gap statistic method proposed by Tibshirani et al.
1 parent c76d603 commit ce9bb69

7 files changed

Lines changed: 294 additions & 95 deletions

File tree

clusters.go

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
package clusters
22

33
import (
4-
"math"
5-
64
"gonum.org/v1/gonum/floats"
75
)
86

@@ -30,11 +28,10 @@ type TestResult struct {
3028
clusters, expected int
3129
}
3230

33-
/* Clusterer denotes a operations of learning and testing
31+
/* Clusterer denotes the operation of learning
3432
* common for both Hard and Soft clusterers */
3533
type Clusterer interface {
3634
Learn([][]float64) error
37-
Test([][]float64, ...interface{}) (*TestResult, error)
3835
}
3936

4037
/* HardClusterer defines a set of operations for hard clustering algorithms */
@@ -84,12 +81,19 @@ type SoftClusterer interface {
8481
Clusterer
8582
}
8683

84+
type Estimator interface {
85+
86+
/* Estimates the number of clusters */
87+
Estimate([][]float64) (int, error)
88+
}
89+
8790
var (
8891
EuclideanDistance = func(a, b []float64) float64 {
8992
return floats.Distance(a, b, 2)
9093
}
9194

9295
EuclideanDistanceSquared = func(a, b []float64) float64 {
93-
return math.Pow(floats.Distance(a, b, 2), 2)
96+
t := floats.Distance(a, b, 2)
97+
return t * t
9498
}
9599
)

dbscan.go

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -102,14 +102,6 @@ func (c *dbscanClusterer) Learn(data [][]float64) error {
102102
return nil
103103
}
104104

105-
func (c *dbscanClusterer) Test(data [][]float64, args ...interface{}) (*TestResult, error) {
106-
if len(data) == 0 {
107-
return nil, ErrEmptySet
108-
}
109-
110-
return nil, nil
111-
}
112-
113105
func (c *dbscanClusterer) Sizes() []int {
114106
c.mu.RLock()
115107
defer c.mu.RUnlock()

errors.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@ package clusters
33
import "errors"
44

55
var (
6-
ErrEmptySet = errors.New("Empty training set")
7-
ErrNotTrained = errors.New("You need to train the algorithm first")
8-
ErrZeroIterations = errors.New("Number of iterations cannot be less than 1")
9-
ErrOneCluster = errors.New("Number of clusters cannot be less than 2")
10-
ErrZeroEpsilon = errors.New("Epsilon cannot be 0")
11-
ErrZeroMinpts = errors.New("MinPts cannot be 0")
12-
ErrZeroWorkers = errors.New("Number of workers cannot be less than 0")
13-
ErrZeroXi = errors.New("Xi cannot be 0")
14-
ErrInvalidRange = errors.New("Range is invalid")
6+
ErrEmptySet = errors.New("Empty training set")
7+
ErrNotTrained = errors.New("You need to train the algorithm first")
8+
ErrZeroIterations = errors.New("Number of iterations cannot be less than 1")
9+
ErrOneCluster = errors.New("Number of clusters cannot be less than 2")
10+
ErrZeroEpsilon = errors.New("Epsilon cannot be 0")
11+
ErrZeroMinpts = errors.New("MinPts cannot be 0")
12+
ErrZeroWorkers = errors.New("Number of workers cannot be less than 0")
13+
ErrZeroXi = errors.New("Xi cannot be 0")
14+
ErrInvalidRange = errors.New("Range is invalid")
15+
ErrTestingNotSupported = errors.New("Testing is not supported for this algorithm")
1516
)

kmeans.go

Lines changed: 0 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package clusters
22

33
import (
4-
"errors"
54
"math"
65
"math/rand"
76
"sync"
@@ -109,52 +108,6 @@ func (c *kmeansClusterer) Learn(data [][]float64) error {
109108
return nil
110109
}
111110

112-
func (c *kmeansClusterer) Test(data [][]float64, args ...interface{}) (*TestResult, error) {
113-
if len(data) == 0 {
114-
return nil, ErrEmptySet
115-
}
116-
117-
clusters, ok := args[0].(int)
118-
if !ok {
119-
return nil, errors.New("Argument #0 is invalid")
120-
}
121-
122-
var (
123-
size = len(data)
124-
bounds = bounds(data)
125-
wks = make([]float64, clusters)
126-
wkbs = make([]float64, clusters)
127-
sk = make([]float64, clusters)
128-
one = make([]float64, clusters)
129-
bwkbs = make([]float64, clusters)
130-
)
131-
132-
for i := 0; i < clusters; i++ {
133-
c.Learn(data)
134-
135-
wks[i] = math.Log(wk(c.d, c.m, c.a))
136-
137-
for j := 0; j < clusters; j++ {
138-
c.Learn(c.buildRandomizedSet(size, bounds))
139-
140-
bwkbs[j] = math.Log(wk(c.d, c.m, c.a))
141-
one[j] = 1
142-
}
143-
144-
wkbs[i] = floats.Sum(bwkbs) / float64(clusters)
145-
146-
floats.Scale(wkbs[i], one)
147-
floats.Sub(bwkbs, one)
148-
floats.Mul(bwkbs, bwkbs)
149-
150-
sk[i] = math.Sqrt(floats.Sum(bwkbs) / float64(clusters))
151-
}
152-
153-
floats.Scale(math.Sqrt(1+(1/float64(clusters))), sk)
154-
155-
return nil, nil
156-
}
157-
158111
func (c *kmeansClusterer) Sizes() []int {
159112
c.mu.RLock()
160113
defer c.mu.RUnlock()
@@ -371,20 +324,3 @@ func (c *kmeansClusterer) check() {
371324

372325
c.oldchanges = c.changes
373326
}
374-
375-
func (c *kmeansClusterer) buildRandomizedSet(size int, bounds []*[2]float64) [][]float64 {
376-
var (
377-
l = len(bounds)
378-
r = make([][]float64, size)
379-
)
380-
381-
for i := 0; i < size; i++ {
382-
r[i] = make([]float64, l)
383-
384-
for j := 0; j < l; j++ {
385-
r[i][j] = uniform(bounds[j])
386-
}
387-
}
388-
389-
return r
390-
}

0 commit comments

Comments
 (0)