Skip to content

Commit

Permalink
making kmeans take AbstractMatrix instead of Matrix (#138)
Browse files Browse the repository at this point in the history
- also the type constraints for the other `kmeans()` params and corresponding internal functions
- some source formatting cosmetics
- some broadcast to loop conversion to improve performance

fixes #136
see also #129
  • Loading branch information
tlienart authored and alyst committed Jan 29, 2019
1 parent da391f4 commit eaae8dd
Show file tree
Hide file tree
Showing 4 changed files with 193 additions and 135 deletions.
197 changes: 95 additions & 102 deletions src/kmeans.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

#### Interface

mutable struct KmeansResult{T<:AbstractFloat} <: ClusteringResult
mutable struct KmeansResult{T<:Real} <: ClusteringResult
centers::Matrix{T} # cluster centers (d x k)
assignments::Vector{Int} # assignments (n)
costs::Vector{T} # costs of the resultant assignments (n)
Expand All @@ -18,35 +18,34 @@ const _kmeans_default_maxiter = 100
const _kmeans_default_tol = 1.0e-6
const _kmeans_default_display = :none

function kmeans!(X::Matrix{T}, centers::Matrix{T};
weights=nothing,
function kmeans!(X::AbstractMatrix{T}, centers::AbstractMatrix{T};
weights::Union{Nothing, AbstractVector{<:Real}}=nothing,
maxiter::Integer=_kmeans_default_maxiter,
tol::Real=_kmeans_default_tol,
display::Symbol=_kmeans_default_display,
distance::SemiMetric=SqEuclidean()) where T<:AbstractFloat
distance::SemiMetric=SqEuclidean()) where T<:Real

m, n = size(X)
m2, k = size(centers)
m == m2 || throw(DimensionMismatch("Inconsistent array dimensions."))
(2 <= k < n) || error("k must have 2 <= k < n.")

assignments = zeros(Int, n)
costs = zeros(T, n)
assignments = Vector{Int}(undef, n)
costs = Vector{T}(undef, n)
counts = Vector{Int}(undef, k)
cweights = Vector{Float64}(undef, k)

_kmeans!(X, conv_weights(T, n, weights), centers,
assignments, costs, counts, cweights,
round(Int, maxiter), tol, display_level(display), distance)
assignments, costs, counts, cweights, round(Int, maxiter), tol,
display_level(display), distance)
end

function kmeans(X::Matrix{T}, k::Int;
weights=nothing,
init=_kmeans_default_init,
function kmeans(X::AbstractMatrix{<:Real}, k::Int;
weights=nothing, init=_kmeans_default_init,
maxiter::Integer=_kmeans_default_maxiter,
tol::Real=_kmeans_default_tol,
display::Symbol=_kmeans_default_display,
distance::SemiMetric=SqEuclidean()) where T<:AbstractFloat
distance::SemiMetric=SqEuclidean())

m, n = size(X)
(2 <= k < n) || error("k must have 2 <= k < n.")
Expand All @@ -64,27 +63,27 @@ end

# core k-means skeleton
function _kmeans!(
x::Matrix{T}, # in: sample matrix (d x n)
w::Union{Nothing, Vector{T}}, # in: sample weights (n)
centers::Matrix{T}, # in/out: matrix of centers (d x k)
assignments::Vector{Int}, # out: vector of assignments (n)
costs::Vector{T}, # out: costs of the resultant assignments (n)
counts::Vector{Int}, # out: the number of samples assigned to each cluster (k)
cweights::Vector{Float64}, # out: the weights of each cluster
maxiter::Int, # in: maximum number of iterations
tol::Real, # in: tolerance of change at convergence
displevel::Int, # in: the level of display
distance::SemiMetric) where T<:AbstractFloat # in: function to calculate the distance with
X::AbstractMatrix{T}, # in: sample matrix (d x n)
w::Union{Nothing, AbstractVector{<:Real}}, # in: sample weights (n)
centers::AbstractMatrix{T}, # in/out: matrix of centers (d x k)
assignments::Vector{Int}, # out: vector of assignments (n)
costs::Vector{T}, # out: costs of the resultant assignments (n)
counts::Vector{Int}, # out: # samples assigned to each cluster (k)
cweights::Vector{Float64}, # out: weights of each cluster
maxiter::Integer, # in: maximum number of iterations
tol::Real, # in: tolerance of change at convergence
displevel::Int, # in: the level of display
distance::SemiMetric # in: function to calculate the distance with
) where T<:Real

# initialize

k = size(centers, 2)
to_update = Vector{Bool}(undef, k) # indicators of whether a center needs to be updated
unused = Int[]
unused = Vector{Int}()
num_affected::Int = k # number of centers, to which the distances need to be recomputed

dmat = pairwise(distance, centers, x)
dmat = convert(Array{T}, dmat) #Can be removed if one day Distance.result_type(SqEuclidean(), T, T) == T
dmat = pairwise(distance, centers, X)
dmat = convert(Array{T}, dmat) # Can be removed if one day Distance.result_type(SqEuclidean(), T, T) == T
update_assignments!(dmat, true, assignments, costs, counts, to_update, unused)
objv = w === nothing ? sum(costs) : dot(w, costs)

Expand All @@ -101,35 +100,33 @@ function _kmeans!(
t = t + 1

# update (affected) centers

update_centers!(x, w, assignments, to_update, centers, cweights)
update_centers!(X, w, assignments, to_update, centers, cweights)

if !isempty(unused)
repick_unused_centers(x, costs, centers, unused)
repick_unused_centers(X, costs, centers, unused)
end

# update pairwise distance matrix

if !isempty(unused)
to_update[unused] .= true
end

if t == 1 || num_affected > 0.75 * k
pairwise!(dmat, distance, centers, x)
pairwise!(dmat, distance, centers, X)
else
# if only a small subset is affected, only compute for that subset
affected_inds = findall(to_update)
dmat_p = pairwise(distance, centers[:, affected_inds], x)
dmat_p = pairwise(distance, centers[:, affected_inds], X)
dmat[affected_inds, :] .= dmat_p
end

# update assignments

update_assignments!(dmat, false, assignments, costs, counts, to_update, unused)

num_affected = sum(to_update) + length(unused)

# compute change of objective and determine convergence

prev_objv = objv
objv = w === nothing ? sum(costs) : dot(w, costs)
objv_change = objv - prev_objv
Expand Down Expand Up @@ -157,8 +154,8 @@ function _kmeans!(
end
end

return KmeansResult(centers, assignments, costs, counts, cweights,
Float64(objv), t, converged)
return KmeansResult(convert(Matrix, centers), assignments, costs, counts,
cweights, Float64(objv), t, converged)
end


Expand All @@ -167,15 +164,16 @@ end
# an updated (squared) distance matrix
#
function update_assignments!(
dmat::Matrix{T}, # in: distance matrix (k x n)
is_init::Bool, # in: whether it is the initial run
assignments::Vector{Int}, # out: assignment vector (n)
costs::Vector{T}, # out: costs of the resultant assignment (n)
counts::Vector{Int}, # out: number of samples assigned to each cluster (k)
to_update::Vector{Bool}, # out: whether a center needs update (k)
unused::Vector{Int}) where T<:AbstractFloat # out: the list of centers get no samples assigned to it
dmat::Matrix{T}, # in: distance matrix (k x n)
is_init::Bool, # in: whether it is the initial run
assignments::Vector{Int}, # out: assignment vector (n)
costs::Vector{T}, # out: costs of the resultant assignment (n)
counts::Vector{Int}, # out: # samples assigned to each cluster (k)
to_update::Vector{Bool}, # out: whether a center needs update (k)
unused::Vector{Int} # out: list of centers with no samples
) where T<:Real

k::Int, n::Int = size(dmat)
k, n = size(dmat)

# re-initialize the counting vector
fill!(counts, 0)
Expand All @@ -190,12 +188,12 @@ function update_assignments!(
end

# process each sample
@inbounds for j = 1 : n
@inbounds for j = 1:n

# find the closest cluster to the i-th sample
a::Int = 1
c::T = dmat[1, j]
for i = 2 : k
a = 1
c = dmat[1, j]
for i = 2:k
ci = dmat[i, j]
if ci < c
a = i
Expand Down Expand Up @@ -224,7 +222,7 @@ function update_assignments!(

# look for centers that have no associated samples

for i = 1 : k
for i = 1:k
if counts[i] == 0
push!(unused, i)
to_update[i] = false # this is handled using different mechanism
Expand All @@ -238,49 +236,43 @@ end
# (specific to the case where samples are not weighted)
#
function update_centers!(
x::Matrix{T}, # in: sample matrix (d x n)
w::Nothing, # in: sample weights
assignments::Vector{Int}, # in: assignments (n)
to_update::Vector{Bool}, # in: whether a center needs update (k)
centers::Matrix{T}, # out: updated centers (d x k)
cweights::Vector) where T<:AbstractFloat # out: updated cluster weights (k)

d::Int = size(x, 1)
n::Int = size(x, 2)
k::Int = size(centers, 2)
X::AbstractMatrix{T}, # in: sample matrix (d x n)
w::Nothing, # in: sample weights
assignments::Vector{Int}, # in: assignments (n)
to_update::Vector{Bool}, # in: whether a center needs update (k)
centers::AbstractMatrix{T}, # out: updated centers (d x k)
cweights::Vector{Float64} # out: updated cluster weights (k)
) where T<:Real

d, n = size(X)
k = size(centers, 2)

# initialize center weights
for i = 1 : k
if to_update[i]
cweights[i] = 0.
end
end
cweights[to_update] .= 0.0

# accumulate columns
@inbounds for j = 1 : n
@inbounds for j = 1:n
cj = assignments[j]
1 <= cj <= k || error("assignment out of boundary.")
if to_update[cj]
if cweights[cj] > 0
for i = 1:d
centers[i, cj] += x[i, j]
centers[i, cj] += X[i, j]
end
else
for i = 1:d
centers[i, cj] = x[i, j]
centers[i, cj] = X[i, j]
end
end
cweights[cj] += 1
end
end

# sum ==> mean
for j = 1:k
@inbounds for j = 1:k
if to_update[j]
@inbounds cj::T = 1 / cweights[j]
vj = view(centers,:,j)
for i = 1:d
@inbounds vj[i] *= cj
centers[i, j] /= cweights[j]
end
end
end
Expand All @@ -292,47 +284,47 @@ end
# (specific to the case where samples are weighted)
#
function update_centers!(
x::Matrix{T}, # in: sample matrix (d x n)
weights::Vector{T}, # in: sample weights (n)
assignments::Vector{Int}, # in: assignments (n)
to_update::Vector{Bool}, # in: whether a center needs update (k)
centers::Matrix{T}, # out: updated centers (d x k)
cweights::Vector # out: updated cluster weights (k)
) where T<:AbstractFloat

d::Int = size(x, 1)
n::Int = size(x, 2)
k::Int = size(centers, 2)
X::AbstractMatrix{T}, # in: sample matrix (d x n)
weights::AbstractVector{<:Real}, # in: sample weights (n)
assignments::Vector{Int}, # in: assignments (n)
to_update::Vector{Bool}, # in: whether a center needs update (k)
centers::AbstractMatrix{T}, # out: updated centers (d x k)
cweights::Vector{Float64} # out: updated cluster weights (k)
) where T<:Real

d, n = size(X)
k = size(centers, 2)

# initialize center weights
cweights[to_update] .= 0.0

# accumulate columns
# accumulate_cols_u!(centers, cweights, x, assignments, weights, to_update)
for j = 1 : n
@inbounds wj = weights[j]

@inbounds for j = 1:n
wj = weights[j]
if wj > 0
@inbounds cj = assignments[j]
cj = assignments[j]
1 <= cj <= k || error("assignment out of boundary.")

if to_update[cj]
rj = view(centers, :, cj)
xj = view(x, :, j)
if cweights[cj] > 0
@inbounds rj .+= xj * wj
for i = 1:d
centers[i, cj] += X[i, j] * wj
end
else
@inbounds rj .= xj * wj
for i = 1:d
centers[i, cj] = X[i, j] * wj
end
end
cweights[cj] += wj
end
end
end

# sum ==> mean
for j = 1:k
@inbounds for j = 1:k
if to_update[j]
@inbounds centers[:, j] .*= 1 / cweights[j]
for i = 1:d
centers[i, j] /= cweights[j]
end
end
end
end
Expand All @@ -342,23 +334,24 @@ end
# Re-picks centers that get no samples assigned to them.
#
function repick_unused_centers(
x::Matrix{T}, # in: the sample set (d x n)
costs::Vector{T}, # in: the current assignment costs (n)
centers::Matrix{T}, # to be updated: the centers (d x k)
unused::Vector{Int}) where T<:AbstractFloat # in: the set of indices of centers to be updated
X::AbstractMatrix{T}, # in: the sample set (d x n)
costs::Vector{T}, # in: the current assignment costs (n)
centers::AbstractMatrix{T}, # to be updated: the centers (d x k)
unused::Vector{Int} # in: set of indices of centers to be updated
) where T<:Real

# pick new centers using a scheme like kmeans++
ds = similar(costs)
tcosts = copy(costs)
n = size(x, 2)
n = size(X, 2)

for i in unused
j = wsample(1:n, tcosts)
tcosts[j] = 0
v = x[:,j]
centers[:,i] = v
v = view(X, :, j)
centers[:, i] = v

colwise!(ds, SqEuclidean(), v, x)
colwise!(ds, SqEuclidean(), v, X)
tcosts = min(tcosts, ds)
end
end
Loading

0 comments on commit eaae8dd

Please sign in to comment.