Skip to content

Commit 3506968

Browse files
author
Andrey Oskin
committed
YingYang, release candidate
1 parent 9bdfae2 commit 3506968

File tree

4 files changed

+131
-54
lines changed

4 files changed

+131
-54
lines changed

docs/src/index.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,11 +56,14 @@ git checkout experimental
5656
- [X] Implementation of [Hamerly implementation](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster).
5757
- [X] Interface for inclusion in Alan Turing Institute's [MLJModels](https://github.com/alan-turing-institute/MLJModels.jl#who-is-this-repo-for).
5858
- [X] Full Implementation of Triangle inequality based on [Elkan - 2003 "Using the Triangle Inequality to Accelerate K-Means"](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf).
59+
- [X] Implementation of [Yinyang K-Means: A Drop-In Replacement of the Classic K-Means
60+
with Consistent Speedup](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf)
5961
- [ ] Implementation of [Geometric methods to accelerate k-means algorithm](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf).
6062
- [ ] Support for other distance metrics supported by [Distances.jl](https://github.com/JuliaStats/Distances.jl#supported-distances).
6163
- [ ] Native support for tabular data inputs outside of MLJModels' interface.
6264
- [ ] Refactoring and finalization of API design.
6365
- [ ] GPU support.
66+
- [ ] Distributed calculations support.
6467
- [ ] Implementation of other K-Means algorithm variants based on recent literature.
6568
- [ ] Optimization of code base.
6669
- [ ] Improved Documentation
@@ -103,6 +106,7 @@ r.converged # whether the procedure converged
103106
- [Lloyd()](https://cs.nyu.edu/~roweis/csc2515-2006/readings/lloyd57.pdf)
104107
- [Hamerly()](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster)
105108
- [Elkan()](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf)
109+
- [YingYang()](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf)
106110
- [Geometric()](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf) - (Coming soon)
107111
- [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - (Coming soon)
108112

@@ -174,8 +178,10 @@ ________________________________________________________________________________
174178

175179
- 0.1.0 Initial release.
176180
- 0.1.1 Added interface for MLJ.
177-
- 0.1.2 Added Elkan algorithm.
181+
- 0.1.2 Added `Elkan` algorithm.
178182
- 0.1.3 Faster & optimized execution.
183+
- 0.1.4 Bug fixes
184+
- 0.1.5 Added `YingYang` algorithm.
179185

180186
## Contributing
181187

src/yingyang.jl

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,30 @@ YingYang algorithm implementation, based on "Yufei Ding et al. 2015. Yinyang K-M
55
Replacement of the Classic K-Means with Consistent Speedup. Proceedings of the 32nd International
66
Conference on Machine Learning, ICML 2015, Lille, France, 6-11 July 2015"
77
8+
Generally it outperforms the `Hamerly` algorithm and has roughly the same running time as the `Elkan`
9+
algorithm with much lower memory consumption.
10+
811
It can be used directly in `kmeans` function
912
1013
```julia
1114
X = rand(30, 100_000) # 100_000 random points in 30 dimensions
1215
1316
kmeans(YingYang(), X, 3) # 3 clusters, YingYang algorithm
1417
```
18+
19+
`YingYang` supports the following arguments:
20+
`auto`: `Bool`, indicates whether to perform automated or manual grouping
21+
`group_size`: `Int`, estimate of the average number of clusters per group. Lower numbers
22+
correspond to higher calculation speed and higher memory consumption and vice versa.
1523
"""
1624
struct YingYang <: AbstractKMeansAlg
1725
auto::Bool
18-
divider::Int
26+
group_size::Int
1927
end
2028

2129
YingYang() = YingYang(true, 7)
2230
YingYang(auto::Bool) = YingYang(auto, 7)
23-
YingYang(divider::Int) = YingYang(true, divider)
31+
YingYang(group_size::Int) = YingYang(true, group_size)
2432

2533
function kmeans!(alg::YingYang, containers, X, k;
2634
n_threads = Threads.nthreads(),
@@ -98,7 +106,7 @@ function create_containers(alg::YingYang, k, nrow, ncol, n_threads)
98106
end
99107

100108
if alg.auto
101-
t = k ÷ alg.divider
109+
t = k ÷ alg.group_size
102110
t = t < 1 ? 1 : t
103111
else
104112
t = 1

test/test00_yingyang.jl

Lines changed: 0 additions & 50 deletions
This file was deleted.

test/test06_yingyang.jl

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
module TestYingYang
2+
3+
using ParallelKMeans
4+
using Test
5+
using Random
6+
7+
@testset "basic kmeans yingyang" begin
8+
X = [1. 2. 4.;]
9+
res = kmeans(YingYang(false), X, 1; n_threads = 1, tol = 1e-6, verbose = false)
10+
@test res.assignments == [1, 1, 1]
11+
@test res.centers[1] ≈ 2.3333333333333335
12+
@test res.totalcost ≈ 4.666666666666666
13+
@test res.converged
14+
15+
res = kmeans(YingYang(false), X, 2; n_threads = 1, init = [1.0 4.0], tol = 1e-6, verbose = false)
16+
@test res.assignments == [1, 1, 2]
17+
@test res.centers ≈ [1.5 4.0]
18+
@test res.totalcost ≈ 0.5
19+
@test res.converged
20+
end
21+
22+
@testset "yingyang no convergence yield last result" begin
23+
X = [1. 2. 4.;]
24+
res = kmeans(YingYang(false), X, 2; n_threads = 1, init = [1.0 4.0], tol = 1e-6, max_iters = 1, verbose = false)
25+
@test !res.converged
26+
@test res.totalcost ≈ 0.5
27+
end
28+
29+
@testset "yingyang singlethread linear separation" begin
30+
Random.seed!(2020)
31+
32+
X = rand(3, 100)
33+
res = kmeans(YingYang(false), X, 3; n_threads = 1, tol = 1e-10, max_iters = 10, verbose = false)
34+
35+
@test res.totalcost ≈ 14.16198704459199
36+
@test !res.converged
37+
@test res.iterations == 10
38+
end
39+
40+
@testset "yingyang multithread linear separation quasi two threads" begin
41+
Random.seed!(2020)
42+
43+
X = rand(3, 100)
44+
res = kmeans(YingYang(false), X, 3; n_threads = 2, tol = 1e-6, verbose = false)
45+
46+
@test res.totalcost ≈ 14.16198704459199
47+
@test res.converged
48+
end
49+
50+
@testset "yingyang different modes" begin
51+
Random.seed!(2020)
52+
X = rand(3, 100)
53+
init = ParallelKMeans.smart_init(X, 20).centroids
54+
baseline = kmeans(Lloyd(), X, 20, init = init, tol = 1e-10, n_threads = 1, verbose = false, max_iters = 1000)
55+
56+
res = kmeans(YingYang(false), X, 20, init = init, tol = 1e-10, n_threads = 1, verbose = false, max_iters = 1000)
57+
@test res.converged
58+
@test res.totalcost ≈ baseline.totalcost
59+
@test res.assignments == baseline.assignments
60+
@test res.centers ≈ baseline.centers
61+
@test res.iterations == baseline.iterations
62+
63+
res = kmeans(YingYang(), X, 20, init = init, tol = 1e-10, n_threads = 1, verbose = false, max_iters = 1000)
64+
@test res.converged
65+
@test res.totalcost ≈ baseline.totalcost
66+
@test res.assignments == baseline.assignments
67+
@test res.centers ≈ baseline.centers
68+
@test res.iterations == baseline.iterations
69+
70+
res = kmeans(YingYang(10), X, 20, init = init, tol = 1e-10, n_threads = 1, verbose = false, max_iters = 1000)
71+
@test res.converged
72+
@test res.totalcost ≈ baseline.totalcost
73+
@test res.assignments == baseline.assignments
74+
@test res.centers ≈ baseline.centers
75+
@test res.iterations == baseline.iterations
76+
77+
res = kmeans(YingYang(7), X, 20, init = init, tol = 1e-10, n_threads = 1, verbose = false, max_iters = 1000)
78+
@test res.converged
79+
@test res.totalcost ≈ baseline.totalcost
80+
@test res.assignments == baseline.assignments
81+
@test res.centers ≈ baseline.centers
82+
@test res.iterations == baseline.iterations
83+
84+
res = kmeans(YingYang(false), X, 20, init = init, tol = 1e-10, n_threads = 2, verbose = false, max_iters = 1000)
85+
@test res.converged
86+
@test res.totalcost ≈ baseline.totalcost
87+
@test res.assignments == baseline.assignments
88+
@test res.centers ≈ baseline.centers
89+
@test res.iterations == baseline.iterations
90+
91+
res = kmeans(YingYang(), X, 20, init = init, tol = 1e-10, n_threads = 2, verbose = false, max_iters = 1000)
92+
@test res.converged
93+
@test res.totalcost ≈ baseline.totalcost
94+
@test res.assignments == baseline.assignments
95+
@test res.centers ≈ baseline.centers
96+
@test res.iterations == baseline.iterations
97+
98+
res = kmeans(YingYang(10), X, 20, init = init, tol = 1e-10, n_threads = 2, verbose = false, max_iters = 1000)
99+
@test res.converged
100+
@test res.totalcost ≈ baseline.totalcost
101+
@test res.assignments == baseline.assignments
102+
@test res.centers ≈ baseline.centers
103+
@test res.iterations == baseline.iterations
104+
105+
res = kmeans(YingYang(7), X, 20, init = init, tol = 1e-10, n_threads = 2, verbose = false, max_iters = 1000)
106+
@test res.converged
107+
@test res.totalcost ≈ baseline.totalcost
108+
@test res.assignments == baseline.assignments
109+
@test res.centers ≈ baseline.centers
110+
@test res.iterations == baseline.iterations
111+
end
112+
113+
end # module

0 commit comments

Comments
 (0)