Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cluster streaming #13

Open
wants to merge 29 commits into
base: v_1_0_0
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
bfe68f2
new comment format + streaming kmeans+dbscan
dmorgankx Aug 21, 2020
16a9314
new fit/predict/update structure
dmorgankx Sep 2, 2020
6dc1251
Merge https://github.com/KxSystems/ml into cluster_streaming
dmorgankx Sep 2, 2020
ca80d54
predict updates
dmorgankx Sep 10, 2020
b5371e8
hc cluster label fixes
dmorgankx Sep 10, 2020
f712cec
Merge branch 'v_1_0_0' of https://github.com/cmccarthy1/ml into clust…
dmorgankx Sep 10, 2020
3e594e2
hc fixes
dmorgankx Sep 10, 2020
ba3c9a6
convergence function for ap
dmorgankx Sep 11, 2020
ee8b3a7
formatting for ap, dbscan and hc
dmorgankx Sep 11, 2020
679e84a
added -1 cluster for non convergence
dmorgankx Sep 11, 2020
86d25fe
test changes
dmorgankx Sep 14, 2020
27ec6b4
Merge branch 'cluster_streaming' of https://github.com/dmorgankx/ml i…
Oct 1, 2020
7b54583
update to clustering code to add kmeans early stop and cleanup
Oct 5, 2020
a54bdc6
Merge pull request #43 from cmccarthy1/clustering_review
dmorgankx Oct 6, 2020
3f6ab7a
Merge branch 'v_1_0_0' of https://github.com/cmccarthy1/ml into clust…
dmorgankx Oct 6, 2020
fbe4b5b
fix to failing cluster util tests
dmorgankx Oct 6, 2020
798aca3
change of testing format
dmorgankx Oct 6, 2020
5c81ce3
new testing script
dmorgankx Oct 6, 2020
d7b442b
change to seed
dmorgankx Oct 6, 2020
bea876b
addition of failing tests
dmorgankx Oct 6, 2020
1fed4b2
scoring failing tests
dmorgankx Oct 6, 2020
93be66a
change of testing format
dmorgankx Oct 6, 2020
1df2700
util test fix
dmorgankx Oct 6, 2020
09ad749
v2.0.0 (#79)
cmccarthy1 Oct 6, 2020
0628728
Timeseries upd (#81)
Dianeod Oct 7, 2020
0afbb3a
Merge https://github.com/KxSystems/ml into cluster_streaming
dmorgankx Oct 12, 2020
6e97d86
conflict fixes
dmorgankx Oct 12, 2020
6ec16e4
conflict fixes
dmorgankx Oct 12, 2020
726c30c
conflict fixes
dmorgankx Oct 12, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
244 changes: 188 additions & 56 deletions clust/aprop.q
Original file line number Diff line number Diff line change
@@ -1,65 +1,197 @@
\d .ml

// Affinity propagation algorithm
/* data = data points in `value flip` format
/* df = distance function
/* dmp = damping coefficient
/* diag = similarity matrix diagonal value function
/. r > return list of clusters
clust.ap:{[data;df;dmp;diag]
// check distance function and diagonal value
if[not df in key clust.i.dd;clust.i.err.dd[]];
// create initial table with exemplars/matches and similarity, availability and responsibility matrices
info0:clust.i.apinit["f"$data;df;diag];
// run AP algo until there is no change in results over `0.1*count data` runs
info1:{[maxiter;info]maxiter>info`matches}[.1*count data]clust.i.apalgo[dmp]/info0;
// return list of clusters
clust.i.reindex info1`exemplars}
// Affinity Propagation

// Initialize matrices
/* data = data points in `value flip` format
/* df = distance function
/* diag = similarity matrix diagonal value
/. r > returns a dictionary with similarity, availability and responsibility matrices
/ and keys for matches and exemplars to be filled during further iterations
clust.i.apinit:{[data;df;diag]
// calculate similarity matrix values
s:@[;;:;diag raze s]'[s:clust.i.dists[data;df;data]each k;k:til n:count data 0];
// create lists/matrices of zeros for other variables
`matches`exemplars`s`a`r!(0;0#0;s),(2;n;n)#0f}
// @kind function
// @category clust
// @fileoverview Fit affinity propagation algorithm
// @param data {float[][]} Data in matrix format, each column is an individual datapoint
// @param df {symbol} Distance function name within '.ml.clust.df'
// @param dmp {float} Damping coefficient
// @param diag {func} Function applied to the similarity matrix diagonal
// @param iter {dict} Max number of overall iterations and iterations
// without a change in clusters. (::) can be passed in which case the defaults
// of (`total`nochange!200 15) will be used
// @return {dict} Data, input variables, clusters and exemplars
// (`data`inputs`clt`exemplars) required for the predict method
clust.ap.fit:{[data;df;dmp;diag;iter]
data:clust.i.floatConversion[data];
defaultDict:`run`total`nochange!0 200 15;
if[iter~(::);iter:()!()];
if[99h<>type iter;'"iter must be (::) or a dictionary"];
// update iteration dictionary with user changes
updDict:defaultDict,iter;
// cluster data using AP algo
clust.i.runap[data;df;dmp;diag;til count data 0;updDict]
}

// Run affinity propagation algorithm
/* dmp = damping coefficient
/* info = dictionary containing exemplars and matches, similarity, availability and responsibility matrices
/. r > returns updated info
// @kind function
// @category clust
// @fileoverview Predict clusters using AP config
// @param data {float[][]} Data in matrix format, each column is an individual datapoint
// @param cfg {dict} `data`inputs`clt`exemplars returned by clust.ap.fit
// @return {long[]} List of predicted clusters
clust.ap.predict:{[data;cfg]
data:clust.i.floatConversion[data];
if[-1~first cfg`clt;
'"'.ml.clust.ap.fit' did not converge, all clusters returned -1. Cannot predict new data."];
// retrieve cluster centres from training data
ex:cfg[`data][;distinct cfg`exemplars];
// predict testing data clusters
clust.i.appreddist[ex;cfg[`inputs]`df]each$[0h=type data;flip;enlist]data
}


// Utilities

// @kind function
// @category private
// @fileoverview Run affinity propagation algorithm
// @param data {float[][]} Data in matrix format, each column is an individual datapoint
// @param df {symbol} Distance function name within '.ml.clust.df'
// @param dmp {float} Damping coefficient
// @param diag {func} Function applied to the similarity matrix diagonal
// @param idxs {long[]} List of indicies to find distances for
// @param iter {dict} Max number of overall iterations and iterations
// without a change in clusters. (::) can be passed in where the defaults
// of (`total`nochange!200 15) will be used
// @return {long[]} List of clusters
clust.i.runap:{[data;df;dmp;diag;idxs;iter]
// check negative euclidean distance has been given
if[df<>`nege2dist;clust.i.err.ap[]];
// calculate distances, availability and responsibility
info0:clust.i.apinit[data;df;diag;idxs];
// initialize exemplar matrix and convergence boolean
info0,:`emat`conv`iter!((count data 0;iter`nochange)#0b;0b;iter);
// run ap algo until maximum number of iterations completed or convergence
info1:clust.i.apstop clust.i.apalgo[dmp]/info0;
// return data, inputs, clusters and exemplars
inputs:`df`dmp`diag`iter!(df;dmp;diag;iter);
exemplars:info1`exemplars;
clt:$[info1`conv;clust.i.reindex exemplars;count[data 0]#-1];
`data`inputs`clt`exemplars!(data;inputs;clt;exemplars)
}

// @kind function
// @category private
// @fileoverview Initialize matrices
// @param data {float[][]} Data in matrix format, each column is an individual datapoint
// @param df {symbol} Distance function name within '.ml.clust.df'
// @param diag {func} Function applied to the similarity matrix diagonal
// @param idxs {long[]} List of point indices
// @return {dict} Similarity, availability and responsibility matrices
// and keys for matches and exemplars to be filled during further iterations
clust.i.apinit:{[data;df;diag;idxs]
// calculate similarity matrix values
s:clust.i.dists[data;df;data]each idxs;
// update diagonal
s:@[;;:;diag raze s]'[s;k:til n:count data 0];
// create lists/matrices of zeros for other variables
`matches`exemplars`s`a`r!(0;0#0;s),(2;n;n)#0f
}

// @kind function
// @category private
// @fileoverview Run affinity propagation algorithm
// @param dmp {float} Damping coefficient
// @param info {dict} Similarity, availability, responsibility, exemplars,
// matches, iter dictionary, no_conv boolean and iter dict
// @return {dict} Updated inputs
clust.i.apalgo:{[dmp;info]
// update responsibility matrix
info[`r]:clust.i.updr[dmp;info];
// update availability matrix
info[`a]:clust.i.upda[dmp;info];
// find new exemplars
ex:imax each sum info`a`r;
// return updated `info` with new exemplars/matches
update exemplars:ex,matches:?[exemplars~ex;matches+1;0]from info}
// update responsibility matrix
info[`r]:clust.i.updr[dmp;info];
// update availability matrix
info[`a]:clust.i.upda[dmp;info];
// find new exemplars
ex:imax each sum info`a`r;
// update `info` with new exemplars/matches
info:update exemplars:ex,matches:?[exemplars~ex;matches+1;0]from info;
// update iter dictionary
.[clust.i.apconv info;(`iter;`run);+[1]]
}

// Update responsibility matrix
/* dmp = damping coefficient
/* info = dictionary containing exemplars and matches, similarity, availability and responsibility matrices
/. r > returns updated responsibility matrix
// @kind function
// @category private
// @fileoverview Check affinity propagation algorithm for convergence
// @param info {dict} Similarity, availability, responsibility, exemplars,
// matches, iter dictionary, no_conv boolean and iter dict
// @return {dict} Updated info dictionary
clust.i.apconv:{[info]
// iteration dictionary
iter:info`iter;
// exemplar matrix
emat:info`emat;
// existing exemplars
ediag:0<sum clust.i.diag each info`a`r;
emat[;iter[`run]mod iter`nochange]:ediag;
// check for convergence
if[iter[`nochange]<=iter`run;
unconv:count[info`s]<>sum(se=iter`nochange)+0=se:sum each emat;
conv:$[(iter[`total]=iter`run)|not[unconv]&sum[ediag]>0;1b;0b]];
// return updated info
info,`emat`conv!(emat;conv)
}

// @kind function
// @category private
// @fileoverview Retrieve diagonal from a square matrix
// @param m {any[][]} Square matrix
// @return {any[]} Matrix diagonal
clust.i.diag:{[m]
{x y}'[m;til count m]
}

// @kind function
// @category private
// @fileoverview Update responsibility matrix
// @param dmp {float} Damping coefficient
// @param info {dict} Similarity, availability, responsibility, exemplars,
// matches, iter dictionary, no_conv boolean and iter dict
// @return {float[][]} Updated responsibility matrix
clust.i.updr:{[dmp;info]
// create matrix with every points max responsibility - diagonal becomes -inf, current max is becomes second max
mx:{[x;i]@[count[x]#mx;j;:;]max@[x;i,j:x?mx:max x;:;-0w]}'[sum info`s`a;til count info`r];
// calculate new responsibility
(dmp*info`r)+(1-dmp)*info[`s]-mx}
// create matrix with every points max responsibility
// diagonal becomes -inf, current max is becomes second max
mxresp:{[x;i]@[count[x]#mx;j;:;]max@[x;i,j:x?mx:max x;:;-0w]};
mx:mxresp'[sum info`s`a;til count info`r];
// calculate new responsibility
(dmp*info`r)+(1-dmp)*info[`s]-mx
}

// Update availability matrix
/* dmp = damping coefficient
/* info = dictionary containing exemplars and matches, similarity, availability and responsibility matrices
/. r > returns updated availability matrix
// @kind function
// @category private
// @fileoverview Update availability matrix
// @param dmp {float} Damping coefficient
// @param info {dict} Similarity, availability, responsibility, exemplars,
// matches, iter dictionary, no_conv boolean and iter dict
// @return {float[][]} Returns updated availability matrix
clust.i.upda:{[dmp;info]
// sum values in positive availability matrix
s:sum@[;;:;0f]'[pv:0|info`r;k:til n:count info`a];
// create a matrix using the negative values produced by the availability sum + responsibility diagonal - positive availability values
a:@[;;:;]'[0&(s+info[`r]@'k)-/:pv;k;s];
// calculate new availability
(dmp*info`a)+a*1-dmp}
// sum values in positive availability matrix
s:sum@[;;:;0f]'[pv:0|info`r;k:til n:count info`a];
// create a matrix using the negative values produced by the availability sum
// + responsibility diagonal - positive availability values
a:@[;;:;]'[0&(s+info[`r]@'k)-/:pv;k;s];
// calculate new availability
(dmp*info`a)+a*1-dmp
}

// @kind function
// @category private
// @fileoverview Stopping condition for affinity propagation algorithm
// @param info {dict} Similarity, availability, responsibility, exemplars,
// matches, iter dictionary, no_conv boolean and iter dict
// @return {bool} Indicates whether to continue or stop running AP (1/0b)
clust.i.apstop:{[info]
(info[`iter;`total]>info[`iter]`run)&not 1b~info`conv
}

// @kind function
// @category private
// @fileoverview Predict clusters using AP training exemplars
// @param ex {float[][]} Training cluster centres in matrix format,
// each column is an individual datapoint
// @param df {symbol} Distance function name within '.ml.clust.df'
// @param pt {float[]} Current data point
// @return {long[]} Predicted clusters
clust.i.appreddist:{[ex;df;pt]
d?max d:clust.i.dists[ex;df;pt]each til count ex 0
}
Loading