making kmeans take AbstractMatrix instead of Matrix (#138)

- also the type constraints for the other `kmeans()` params and corresponding internal functions - some source formatting cosmetics - some broadcast to loop conversion to improve performance fixes #136 see also #129
JuliaStats · Jan 29, 2019 · eaae8dd · eaae8dd
1 parent da391f4
commit eaae8dd
Show file tree

Hide file tree

Showing 4 changed files with 193 additions and 135 deletions.
diff --git a/src/kmeans.jl b/src/kmeans.jl
@@ -2,7 +2,7 @@
 
 #### Interface
 
-mutable struct KmeansResult{T<:AbstractFloat} <: ClusteringResult
+mutable struct KmeansResult{T<:Real} <: ClusteringResult
     centers::Matrix{T}         # cluster centers (d x k)
     assignments::Vector{Int}   # assignments (n)
     costs::Vector{T}           # costs of the resultant assignments (n)
@@ -18,35 +18,34 @@ const _kmeans_default_maxiter = 100
 const _kmeans_default_tol = 1.0e-6
 const _kmeans_default_display = :none
 
-function kmeans!(X::Matrix{T}, centers::Matrix{T};
-                 weights=nothing,
+function kmeans!(X::AbstractMatrix{T}, centers::AbstractMatrix{T};
+                 weights::Union{Nothing, AbstractVector{<:Real}}=nothing,
                  maxiter::Integer=_kmeans_default_maxiter,
                  tol::Real=_kmeans_default_tol,
                  display::Symbol=_kmeans_default_display,
-                 distance::SemiMetric=SqEuclidean()) where T<:AbstractFloat
+                 distance::SemiMetric=SqEuclidean()) where T<:Real
 
     m, n = size(X)
     m2, k = size(centers)
     m == m2 || throw(DimensionMismatch("Inconsistent array dimensions."))
     (2 <= k < n) || error("k must have 2 <= k < n.")
 
-    assignments = zeros(Int, n)
-    costs = zeros(T, n)
+    assignments = Vector{Int}(undef, n)
+    costs = Vector{T}(undef, n)
     counts = Vector{Int}(undef, k)
     cweights = Vector{Float64}(undef, k)
 
     _kmeans!(X, conv_weights(T, n, weights), centers,
-             assignments, costs, counts, cweights,
-             round(Int, maxiter), tol, display_level(display), distance)
+             assignments, costs, counts, cweights, round(Int, maxiter), tol,
+             display_level(display), distance)
 end
 
-function kmeans(X::Matrix{T}, k::Int;
-                weights=nothing,
-                init=_kmeans_default_init,
+function kmeans(X::AbstractMatrix{<:Real}, k::Int;
+                weights=nothing, init=_kmeans_default_init,
                 maxiter::Integer=_kmeans_default_maxiter,
                 tol::Real=_kmeans_default_tol,
                 display::Symbol=_kmeans_default_display,
-                distance::SemiMetric=SqEuclidean()) where T<:AbstractFloat
+                distance::SemiMetric=SqEuclidean())
 
     m, n = size(X)
     (2 <= k < n) || error("k must have 2 <= k < n.")
@@ -64,27 +63,27 @@ end
 
 # core k-means skeleton
 function _kmeans!(
-    x::Matrix{T},                   # in: sample matrix (d x n)
-    w::Union{Nothing, Vector{T}},      # in: sample weights (n)
-    centers::Matrix{T},             # in/out: matrix of centers (d x k)
-    assignments::Vector{Int},       # out: vector of assignments (n)
-    costs::Vector{T},               # out: costs of the resultant assignments (n)
-    counts::Vector{Int},            # out: the number of samples assigned to each cluster (k)
-    cweights::Vector{Float64},      # out: the weights of each cluster
-    maxiter::Int,                   # in: maximum number of iterations
-    tol::Real,                      # in: tolerance of change at convergence
-    displevel::Int,                 # in: the level of display
-    distance::SemiMetric) where T<:AbstractFloat             # in: function to calculate the distance with
+    X::AbstractMatrix{T},         # in: sample matrix (d x n)
+    w::Union{Nothing, AbstractVector{<:Real}}, # in: sample weights (n)
+    centers::AbstractMatrix{T},   # in/out: matrix of centers (d x k)
+    assignments::Vector{Int},     # out: vector of assignments (n)
+    costs::Vector{T},             # out: costs of the resultant assignments (n)
+    counts::Vector{Int},          # out: # samples assigned to each cluster (k)
+    cweights::Vector{Float64},    # out: weights of each cluster
+    maxiter::Integer,             # in: maximum number of iterations
+    tol::Real,                    # in: tolerance of change at convergence
+    displevel::Int,               # in: the level of display
+    distance::SemiMetric          # in: function to calculate the distance with
+    ) where T<:Real
 
     # initialize
-
     k = size(centers, 2)
     to_update = Vector{Bool}(undef, k) # indicators of whether a center needs to be updated
-    unused = Int[]
+    unused = Vector{Int}()
     num_affected::Int = k # number of centers, to which the distances need to be recomputed
 
-    dmat = pairwise(distance, centers, x)
-    dmat = convert(Array{T}, dmat) #Can be removed if one day Distance.result_type(SqEuclidean(), T, T) == T
+    dmat = pairwise(distance, centers, X)
+    dmat = convert(Array{T}, dmat) # Can be removed if one day Distance.result_type(SqEuclidean(), T, T) == T
     update_assignments!(dmat, true, assignments, costs, counts, to_update, unused)
     objv = w === nothing ? sum(costs) : dot(w, costs)
 
@@ -101,35 +100,33 @@ function _kmeans!(
         t = t + 1
 
         # update (affected) centers
-
-        update_centers!(x, w, assignments, to_update, centers, cweights)
+        update_centers!(X, w, assignments, to_update, centers, cweights)
 
         if !isempty(unused)
-            repick_unused_centers(x, costs, centers, unused)
+            repick_unused_centers(X, costs, centers, unused)
         end
 
         # update pairwise distance matrix
-
         if !isempty(unused)
             to_update[unused] .= true
         end
 
         if t == 1 || num_affected > 0.75 * k
-            pairwise!(dmat, distance, centers, x)
+            pairwise!(dmat, distance, centers, X)
         else
             # if only a small subset is affected, only compute for that subset
             affected_inds = findall(to_update)
-            dmat_p = pairwise(distance, centers[:, affected_inds], x)
+            dmat_p = pairwise(distance, centers[:, affected_inds], X)
             dmat[affected_inds, :] .= dmat_p
         end
 
         # update assignments
 
         update_assignments!(dmat, false, assignments, costs, counts, to_update, unused)
+
         num_affected = sum(to_update) + length(unused)
 
         # compute change of objective and determine convergence
-
         prev_objv = objv
         objv = w === nothing ? sum(costs) : dot(w, costs)
         objv_change = objv - prev_objv
@@ -157,8 +154,8 @@ function _kmeans!(
         end
     end
 
-    return KmeansResult(centers, assignments, costs, counts, cweights,
-                        Float64(objv), t, converged)
+    return KmeansResult(convert(Matrix, centers), assignments, costs, counts,
+                        cweights, Float64(objv), t, converged)
 end
 
 
@@ -167,15 +164,16 @@ end
 #  an updated (squared) distance matrix
 #
 function update_assignments!(
-    dmat::Matrix{T},            # in:  distance matrix (k x n)
-    is_init::Bool,              # in:  whether it is the initial run
-    assignments::Vector{Int},   # out: assignment vector (n)
-    costs::Vector{T},           # out: costs of the resultant assignment (n)
-    counts::Vector{Int},        # out: number of samples assigned to each cluster (k)
-    to_update::Vector{Bool},    # out: whether a center needs update (k)
-    unused::Vector{Int}) where T<:AbstractFloat        # out: the list of centers get no samples assigned to it
+    dmat::Matrix{T},              # in:  distance matrix (k x n)
+    is_init::Bool,                # in:  whether it is the initial run
+    assignments::Vector{Int},     # out: assignment vector (n)
+    costs::Vector{T},             # out: costs of the resultant assignment (n)
+    counts::Vector{Int},          # out: # samples assigned to each cluster (k)
+    to_update::Vector{Bool},      # out: whether a center needs update (k)
+    unused::Vector{Int}          # out: list of centers with no samples
+    ) where T<:Real
 
-    k::Int, n::Int = size(dmat)
+    k, n = size(dmat)
 
     # re-initialize the counting vector
     fill!(counts, 0)
@@ -190,12 +188,12 @@ function update_assignments!(
     end
 
     # process each sample
-    @inbounds for j = 1 : n
+    @inbounds for j = 1:n
 
         # find the closest cluster to the i-th sample
-        a::Int = 1
-        c::T = dmat[1, j]
-        for i = 2 : k
+        a = 1
+        c = dmat[1, j]
+        for i = 2:k
             ci = dmat[i, j]
             if ci < c
                 a = i
@@ -224,7 +222,7 @@ function update_assignments!(
 
     # look for centers that have no associated samples
 
-    for i = 1 : k
+    for i = 1:k
         if counts[i] == 0
             push!(unused, i)
             to_update[i] = false # this is handled using different mechanism
@@ -238,49 +236,43 @@ end
 #  (specific to the case where samples are not weighted)
 #
 function update_centers!(
-    x::Matrix{T},                   # in: sample matrix (d x n)
-    w::Nothing,                        # in: sample weights
-    assignments::Vector{Int},       # in: assignments (n)
-    to_update::Vector{Bool},        # in: whether a center needs update (k)
-    centers::Matrix{T},             # out: updated centers (d x k)
-    cweights::Vector) where T<:AbstractFloat               # out: updated cluster weights (k)
-
-    d::Int = size(x, 1)
-    n::Int = size(x, 2)
-    k::Int = size(centers, 2)
+    X::AbstractMatrix{T},            # in: sample matrix (d x n)
+    w::Nothing,                      # in: sample weights
+    assignments::Vector{Int},        # in: assignments (n)
+    to_update::Vector{Bool},         # in: whether a center needs update (k)
+    centers::AbstractMatrix{T},      # out: updated centers (d x k)
+    cweights::Vector{Float64}        # out: updated cluster weights (k)
+    ) where T<:Real
+
+    d, n = size(X)
+    k = size(centers, 2)
 
     # initialize center weights
-    for i = 1 : k
-        if to_update[i]
-            cweights[i] = 0.
-        end
-    end
+    cweights[to_update] .= 0.0
 
     # accumulate columns
-    @inbounds for j = 1 : n
+    @inbounds for j = 1:n
         cj = assignments[j]
         1 <= cj <= k || error("assignment out of boundary.")
         if to_update[cj]
             if cweights[cj] > 0
                 for i = 1:d
-                    centers[i, cj] += x[i, j]
+                    centers[i, cj] += X[i, j]
                 end
             else
                 for i = 1:d
-                    centers[i, cj] = x[i, j]
+                    centers[i, cj] = X[i, j]
                 end
             end
             cweights[cj] += 1
         end
     end
 
     # sum ==> mean
-    for j = 1:k
+    @inbounds for j = 1:k
         if to_update[j]
-            @inbounds cj::T = 1 / cweights[j]
-            vj = view(centers,:,j)
             for i = 1:d
-                @inbounds vj[i] *= cj
+                centers[i, j] /= cweights[j]
             end
         end
     end
@@ -292,47 +284,47 @@ end
 #  (specific to the case where samples are weighted)
 #
 function update_centers!(
-    x::Matrix{T},                   # in: sample matrix (d x n)
-    weights::Vector{T},             # in: sample weights (n)
-    assignments::Vector{Int},       # in: assignments (n)
-    to_update::Vector{Bool},        # in: whether a center needs update (k)
-    centers::Matrix{T},             # out: updated centers (d x k)
-    cweights::Vector                # out: updated cluster weights (k)
-) where T<:AbstractFloat
-
-    d::Int = size(x, 1)
-    n::Int = size(x, 2)
-    k::Int = size(centers, 2)
+    X::AbstractMatrix{T},            # in: sample matrix (d x n)
+    weights::AbstractVector{<:Real}, # in: sample weights (n)
+    assignments::Vector{Int},        # in: assignments (n)
+    to_update::Vector{Bool},         # in: whether a center needs update (k)
+    centers::AbstractMatrix{T},      # out: updated centers (d x k)
+    cweights::Vector{Float64}        # out: updated cluster weights (k)
+    ) where T<:Real
+
+    d, n = size(X)
+    k = size(centers, 2)
 
     # initialize center weights
     cweights[to_update] .= 0.0
 
     # accumulate columns
-    # accumulate_cols_u!(centers, cweights, x, assignments, weights, to_update)
-    for j = 1 : n
-        @inbounds wj = weights[j]
-
+    @inbounds for j = 1:n
+        wj = weights[j]
         if wj > 0
-            @inbounds cj = assignments[j]
+            cj = assignments[j]
             1 <= cj <= k || error("assignment out of boundary.")
-
             if to_update[cj]
-                rj = view(centers, :, cj)
-                xj = view(x, :, j)
                 if cweights[cj] > 0
-                    @inbounds rj .+= xj * wj
+                    for i = 1:d
+                        centers[i, cj] += X[i, j] * wj
+                    end
                 else
-                    @inbounds rj .= xj * wj
+                    for i = 1:d
+                        centers[i, cj] = X[i, j] * wj
+                    end
                 end
                 cweights[cj] += wj
             end
         end
     end
 
     # sum ==> mean
-    for j = 1:k
+    @inbounds for j = 1:k
         if to_update[j]
-            @inbounds centers[:, j] .*= 1 / cweights[j]
+            for i = 1:d
+                centers[i, j] /= cweights[j]
+            end
         end
     end
 end
@@ -342,23 +334,24 @@ end
 #  Re-picks centers that get no samples assigned to them.
 #
 function repick_unused_centers(
-    x::Matrix{T},           # in: the sample set (d x n)
-    costs::Vector{T},       # in: the current assignment costs (n)
-    centers::Matrix{T},     # to be updated: the centers (d x k)
-    unused::Vector{Int}) where T<:AbstractFloat    # in: the set of indices of centers to be updated
+    X::AbstractMatrix{T},       # in: the sample set (d x n)
+    costs::Vector{T},           # in: the current assignment costs (n)
+    centers::AbstractMatrix{T}, # to be updated: the centers (d x k)
+    unused::Vector{Int}         # in: set of indices of centers to be updated
+    ) where T<:Real
 
     # pick new centers using a scheme like kmeans++
     ds = similar(costs)
     tcosts = copy(costs)
-    n = size(x, 2)
+    n = size(X, 2)
 
     for i in unused
         j = wsample(1:n, tcosts)
         tcosts[j] = 0
-        v = x[:,j]
-        centers[:,i] = v
+        v = view(X, :, j)
+        centers[:, i] = v
 
-        colwise!(ds, SqEuclidean(), v, x)
+        colwise!(ds, SqEuclidean(), v, X)
         tcosts = min(tcosts, ds)
     end
 end