Skip to content

Commit 35a29c2

Browse files
authored
Merge pull request #14 from PyDataBlog/experimental
Experimental (but stable) implementation on a single thread.
2 parents ac87d1b + 70a7803 commit 35a29c2

File tree

3 files changed

+9
-16
lines changed

3 files changed

+9
-16
lines changed

.travis.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ julia:
88
- nightly
99
after_success:
1010
- julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Coveralls.submit(process_folder())'
11-
coveralls: true
1211
jobs:
1312
allow_failures:
1413
- julia: nightly

src/ParallelKMeans.jl

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -131,38 +131,38 @@ function smart_init(X::Array{Float64, 2}, k::Int, mode::T = SingleThread();
131131
rand_idx = rand(1:n_row)
132132
rand_indices[1] = rand_idx
133133
centroids[1, :] .= X[rand_idx, :]
134+
centroids[k, :] .= 0.0
134135
distances = Array{Float64}(undef, n_row, 1)
135136
new_distances = Array{Float64}(undef, n_row, 1)
136137

138+
# TODO: Add `colwise` function (or use it from `Distances` package)
137139
# compute distances from the first centroid chosen to all the other data points
138140
first_centroid_matrix = convert(Matrix, centroids[1, :]')
139141

140142
# flatten distances
141-
# distances = vec(pairwise(SqEuclidean(), X, first_centroid_matrix, dims = 1))
142143
pairwise!(distances, X, first_centroid_matrix, mode)
144+
distances[rand_idx] = 0.0
143145

144146
for i = 2:k
145147
# choose the next centroid, the probability for each data point to be chosen
146148
# is directly proportional to its squared distance from the nearest centroid
147-
r_idx = sample(1:n_row, ProbabilityWeights(vec(distances)))
149+
r_idx = wsample(1:n_row, vec(distances))
148150
rand_indices[i] = r_idx
149151
centroids[i, :] .= X[r_idx, :]
150152

151-
# Ignore setting the last centroid to help the separation of centroids
152-
if i == (k-1)
153-
break
154-
end
153+
# no need for final distance update
154+
i == k && break
155155

156156
# compute distances from the centroids to all data points
157157
current_centroid_matrix = convert(Matrix, centroids[i, :]')
158158
# new_distances = vec(pairwise(SqEuclidean(), X, current_centroid_matrix, dims = 1))
159159
pairwise!(new_distances, X, first_centroid_matrix, mode)
160160

161161
# and update the squared distance as the minimum distance to all centroid
162-
# distances = minimum([distances, new_distances])
163162
for i in 1:n_row
164163
distances[i, 1] = distances[i, 1] < new_distances[i, 1] ? distances[i, 1] : new_distances[i, 1]
165164
end
165+
distances[r_idx, 1] = 0.0
166166
end
167167

168168
else

test/test02_kmeans.jl

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,7 @@ using Random
1111
X = rand(100, 3)
1212
labels, centroids, sum_squares = kmeans(X, 3; tol = 1e-10, verbose = false)
1313

14-
# for future reference: Clustering shows here 14.964882850452984
15-
# guess they use better initialisation. For now we will use own
16-
# value
17-
@test sum_squares ≈ 15.314823028363763
14+
@test sum_squares ≈ 14.964882850452971
1815
end
1916

2017

@@ -24,10 +21,7 @@ end
2421
X = rand(100, 3)
2522
labels, centroids, sum_squares = kmeans(X, 3, MultiThread(); tol = 1e-10, verbose = false)
2623

27-
# for future reference: Clustering shows here 14.964882850452984
28-
# guess they use better initialisation. For now we will use own
29-
# value
30-
@test sum_squares ≈ 15.314823028363763
24+
@test sum_squares ≈ 14.964882850452971
3125
end
3226

3327
end # module

0 commit comments

Comments
 (0)