Skip to content

Commit 35a29c2

Browse files
authored
Merge pull request #14 from PyDataBlog/experimental
Experimental (but stable) implementation on a single thread.
2 parents ac87d1b + 70a7803 commit 35a29c2

File tree

3 files changed

+9
-16
lines changed

3 files changed

+9
-16
lines changed

.travis.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ julia:
88
- nightly
99
after_success:
1010
- julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Coveralls.submit(process_folder())'
11-
coveralls: true
1211
jobs:
1312
allow_failures:
1413
- julia: nightly

src/ParallelKMeans.jl

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -131,38 +131,38 @@ function smart_init(X::Array{Float64, 2}, k::Int, mode::T = SingleThread();
131131
rand_idx = rand(1:n_row)
132132
rand_indices[1] = rand_idx
133133
centroids[1, :] .= X[rand_idx, :]
134+
centroids[k, :] .= 0.0
134135
distances = Array{Float64}(undef, n_row, 1)
135136
new_distances = Array{Float64}(undef, n_row, 1)
136137

138+
# TODO: Add `colwise` function (or use it from `Distances` package)
137139
# compute distances from the first centroid chosen to all the other data points
138140
first_centroid_matrix = convert(Matrix, centroids[1, :]')
139141

140142
# flatten distances
141-
# distances = vec(pairwise(SqEuclidean(), X, first_centroid_matrix, dims = 1))
142143
pairwise!(distances, X, first_centroid_matrix, mode)
144+
distances[rand_idx] = 0.0
143145

144146
for i = 2:k
145147
# choose the next centroid, the probability for each data point to be chosen
146148
# is directly proportional to its squared distance from the nearest centroid
147-
r_idx = sample(1:n_row, ProbabilityWeights(vec(distances)))
149+
r_idx = wsample(1:n_row, vec(distances))
148150
rand_indices[i] = r_idx
149151
centroids[i, :] .= X[r_idx, :]
150152

151-
# Ignore setting the last centroid to help the separation of centroids
152-
if i == (k-1)
153-
break
154-
end
153+
# no need for final distance update
154+
i == k && break
155155

156156
# compute distances from the centroids to all data points
157157
current_centroid_matrix = convert(Matrix, centroids[i, :]')
158158
# new_distances = vec(pairwise(SqEuclidean(), X, current_centroid_matrix, dims = 1))
159159
pairwise!(new_distances, X, first_centroid_matrix, mode)
160160

161161
# and update the squared distance as the minimum distance to all centroid
162-
# distances = minimum([distances, new_distances])
163162
for i in 1:n_row
164163
distances[i, 1] = distances[i, 1] < new_distances[i, 1] ? distances[i, 1] : new_distances[i, 1]
165164
end
165+
distances[r_idx, 1] = 0.0
166166
end
167167

168168
else

test/test02_kmeans.jl

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,7 @@ using Random
1111
X = rand(100, 3)
1212
labels, centroids, sum_squares = kmeans(X, 3; tol = 1e-10, verbose = false)
1313

14-
# for future reference: Clustering shows here 14.964882850452984
15-
# guess they use better initialisation. For now we will use own
16-
# value
17-
@test sum_squares ≈ 15.314823028363763
14+
@test sum_squares ≈ 14.964882850452971
1815
end
1916

2017

@@ -24,10 +21,7 @@ end
2421
X = rand(100, 3)
2522
labels, centroids, sum_squares = kmeans(X, 3, MultiThread(); tol = 1e-10, verbose = false)
2623

27-
# for future reference: Clustering shows here 14.964882850452984
28-
# guess they use better initialisation. For now we will use own
29-
# value
30-
@test sum_squares ≈ 15.314823028363763
24+
@test sum_squares ≈ 14.964882850452971
3125
end
3226

3327
end # module

0 commit comments

Comments
 (0)