Better construction, partial clustering dispatcher
Most importantly, supports priors in the sparse case.

* Add outliers
* Better reduction-based parallel heap construction.
* Clean up assert
* Fix?
* Fix?
* Some issues with pybind11?
* Changed
* Minor revisions and organization for clustering core.
* Remove network_p_wasserstein
* Drafted skeleton for ClusteringSolver.
* Progress in general clusterer.
* Update distmat, correct assertion.
* Nearly finished with clustering under d2 + EM
* Finish drafting both hard and soft extrinsic clustering.
* Minor cosmetic alterations.
* More pieces tied together for clustering. Remaining, metric k-median clustering + sparse with prior.
* Save work. Remaining before release: gluing in metric k-median code, supporting priors with sparse data.
* Finish drafting clusterer for all cases.
* Correction.
* Pull out diskmat.
* Clean up details, make one interface for dispatching clustering.
* Tweaks
* Correct compilation.
* Cluster testing.
* More debugging.
* Cluster testing.
* Cluster testing.
* Cluster testing.
* Update
* Correct compilation.
* Correctly clustering easy k-means example.
* soft clustering passes
* Data generation for clustering testing.
* Use b-tree instead of std::set.
* Allow changing of hdf group
* Add HDF5 script.
* knngraph
* Save
* (M)-KNN graph
* Save work
* MKNN via LSH.
* Different storage formats
* Local search heuristic correction.
* Remove, add fancy handlers for sparse routines
* Still debugging special sparse versions.
* Tweaks
* Format support in jsdtest
* Error in prep
* bug
* Integrate sparse prior test
* Update test
* Correct number shared zeroes calculation.
* Sparse prior testing, + shared nonzeros
* Corrections.
Showing 61 changed files with 4,071 additions and 2,832 deletions.
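The squashed history above repeatedly references clustering under d2 + EM and both hard and soft extrinsic clustering. As a rough, library-agnostic sketch of the soft case (plain squared Euclidean distance and hypothetical names, not minocore's actual dispatcher), one EM-style step looks like this:

import numpy as np

def soft_kmeans_step(X, centers, beta=1.0):
    # Squared Euclidean distance from every point to every center: shape (n, k).
    d = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=-1)
    # Soft assignments: responsibilities proportional to exp(-beta * distance),
    # shifted by the per-row minimum for numerical stability.
    a = np.exp(-beta * (d - d.min(axis=1, keepdims=True)))
    a /= a.sum(axis=1, keepdims=True)
    # Weighted centroid update: each center is the responsibility-weighted mean.
    new_centers = (a.T @ X) / a.sum(axis=0)[:, None]
    return a, new_centers

Hard assignment corresponds to the large-beta limit, where each row of a collapses onto its nearest center.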
@@ -0,0 +1,56 @@
import numpy as np
import argparse

try:
    from cytoolz import frequencies as Counter
except ImportError:
    from collections import Counter
np.random.seed(0)

ap = argparse.ArgumentParser()
ap.add_argument("--num-clusters", type=int, help="Number of clusters.", default=10)
ap.add_argument("--num-rows", type=int, help="Number of rows.", default=5000)
ap.add_argument("--num-dim", type=int, help="Number of dimensions.", default=50)
ap.add_argument("--set-noise", type=float, default=1., help="Scale of per-point Gaussian perturbation.")
ap.add_argument("--set-data-variance", type=float, default=5., help="Scale of the Cauchy draws used for centers.")
ap.add_argument("--outfile", type=str, default="randombregman.out")
ap.add_argument("--sample-coverage", type=int, default=1000, help="Mean number of multinomial draws per row.")
ap = ap.parse_args()

num_clusters = ap.num_clusters
num_dim = ap.num_dim
num_rows = ap.num_rows

assert num_rows % num_clusters == 0, "num rows must be divisible by number of clusters"

# Draw nonnegative cluster centers, then normalize each to a probability distribution.
centers = np.abs(np.random.standard_cauchy(size=(num_clusters, num_dim)) * ap.set_data_variance)
centers = (1. / np.sum(centers, axis=1))[:, np.newaxis] * centers

datapoints = []
for i in range(num_clusters):
    for j in range(num_rows // num_clusters):
        # Perturb the center, renormalize, then draw a Poisson-distributed number
        # of multinomial samples to produce an integer count vector.
        nsamp = np.random.poisson(ap.sample_coverage)
        row = centers[i] + np.random.standard_normal(size=(num_dim,)) * ap.set_noise
        row = np.abs(row)
        row /= np.sum(row)
        selections = Counter(np.random.choice(num_dim, p=row, size=nsamp))
        samples = np.zeros((num_dim,))
        for k, v in selections.items():
            samples[k] = v
        datapoints.append(samples)

datapoints = np.vstack(datapoints)

# Shuffle the rows so that cluster members are interleaved in the output file.
ordering = np.arange(0, num_rows, dtype=np.uint32)
np.random.shuffle(ordering)
with open(ap.outfile, "w") as ofp:
    ofp.write("%d/%d/%d\n" % (num_rows, num_dim, num_clusters))
    for index in ordering:
        ofp.write(" ".join(map(str, datapoints[index, :])) + "\n")
with open(ap.outfile + ".labels.txt", "w") as f:
    f.write("\n".join(str(ordering[i] // (num_rows // num_clusters)) for i in range(num_rows)))
    f.write("\n")
@@ -0,0 +1,30 @@
import numpy as np
import sys
np.random.seed(0)

num_clusters = 10
num_dim = 50
num_rows = 5000
assert num_rows % num_clusters == 0

# Cluster centers: absolute values of scaled Gaussian draws.
centers = np.abs(np.random.standard_normal(size=(num_clusters, num_dim)) * 5.)

# Unit-variance Gaussian points around each center, one contiguous block per cluster.
points = np.vstack([np.random.standard_normal(size=(num_rows // num_clusters, num_dim)) + centers[i, :][np.newaxis, :]
                    for i in range(num_clusters)])

# Shuffle the rows so cluster members are interleaved in the output file.
ordering = np.arange(0, num_rows, dtype=np.uint32)
np.random.shuffle(ordering)

outname = sys.argv[1] if sys.argv[1:] else "random.out"
labels = outname + ".labels.txt"

with open(outname, "w") as ofp:
    # Header: rows/dims/clusters, then one whitespace-separated row per line.
    ofp.write("%d/%d/%d\n" % (num_rows, num_dim, num_clusters))
    for index in ordering:
        ofp.write(" ".join(map(str, points[index, :])) + "\n")
with open(labels, "w") as f:
    # Ground-truth label of output row i is the cluster of the shuffled source row.
    f.write("\n".join(str(ordering[i] // (num_rows // num_clusters)) for i in range(num_rows)))
    f.write("\n")
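Both generators emit the same on-disk format, so either output can be sanity-checked against a reference clustering. For example, using scikit-learn purely as an outside check (it is not a dependency of this repository):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

with open("random.out") as fp:
    n_rows, n_dim, k = map(int, fp.readline().strip().split("/"))
    X = np.loadtxt(fp)
truth = np.loadtxt("random.out.labels.txt", dtype=int)

pred = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
# Well-separated Gaussian clusters should give an adjusted Rand index close to 1.
print("ARI:", adjusted_rand_score(truth, pred))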
@@ -0,0 +1,8 @@
#ifndef MINOCORE_CLUSTERING_HEADERS_H__
#define MINOCORE_CLUSTERING_HEADERS_H__

#include "minocore/clustering/dispatch.h"
#include "minocore/clustering/traits.h"
#include "minocore/clustering/sampling.h"

#endif /* MINOCORE_CLUSTERING_HEADERS_H__ */
@@ -0,0 +1,161 @@
#ifndef MINOCORE_CLUSTERING_CENTROID_H__
#define MINOCORE_CLUSTERING_CENTROID_H__
#include "minocore/dist.h"
#include "minocore/util/blaze_adaptor.h"
#include "minocore/optim/kmedian.h"

namespace minocore { namespace clustering {

struct CentroidPolicy {
    // Compute the centroid of the rows in r (with row sums rs and optional per-point
    // weights wc) under the given dissimilarity measure, dispatching to L1 medians,
    // LLR-family weighted means, or (weighted) arithmetic means as appropriate.
    template<typename VT, bool TF, typename Range, typename VT2=VT, typename RowSums>
    static void perform_average(blaze::DenseVector<VT, TF> &ret, const Range &r, const RowSums &rs,
                                const VT2 *wc = static_cast<VT2 *>(nullptr),
                                dist::DissimilarityMeasure measure=static_cast<dist::DissimilarityMeasure>(-1))
    {
        using FT = blz::ElementType_t<VT>;
        PREC_REQ(measure != static_cast<dist::DissimilarityMeasure>(-1), "Must define dissimilarity measure");
        if(measure == dist::TOTAL_VARIATION_DISTANCE) {
            PRETTY_SAY << "TVD: performing " << (wc ? static_cast<const char *>("weighted"): static_cast<const char *>("unweighted")) << " L1 median on *normalized* categorical distributions.\n";
            if(wc)
                coresets::l1_median(r, ret, wc->data());
            else
                coresets::l1_median(r, ret);
        }
        else if(measure == dist::L1) {
            std::conditional_t<blz::IsSparseMatrix_v<Range>,
                blz::CompressedMatrix<FT, blz::StorageOrder_v<Range> >,
                blz::DynamicMatrix<FT, blz::StorageOrder_v<Range> >
            > cm = r % blz::expand(trans(rs), r.columns());
            PRETTY_SAY << "L1: performing " << (wc ? static_cast<const char *>("weighted"): static_cast<const char *>("unweighted")) << " L1 median on *unnormalized* categorical distributions, i.e. absolute count data.\n";
            if(wc)
                coresets::l1_median(cm, ret, wc->data());
            else
                coresets::l1_median(cm, ret);
        } else if(measure == dist::LLR || measure == dist::UWLLR || measure == dist::OLLR) {
            PRETTY_SAY << "LLR test\n";
            FT total_sum_inv;
            if(wc) {
                total_sum_inv = 1. / blz::dot(rs, *wc);
                ~ret = blaze::sum<blz::columnwise>(r % blz::expand(trans(*wc * rs), r.columns())) * total_sum_inv;
            } else {
                total_sum_inv = 1. / blaze::sum(rs);
                ~ret = blaze::sum<blz::columnwise>(r % blz::expand(trans(rs), r.columns())) * total_sum_inv;
            }
        } else if(wc) {
            PRETTY_SAY << "Weighted, anything but L1 or LLR: " << dist::detail::prob2str(measure) << '\n';
            assert((~(*wc)).size() == r.rows());
            assert(blz::expand(~(*wc), r.columns()).rows() == r.rows());
            assert(blz::expand(~(*wc), r.columns()).columns() == r.columns());
            auto wsuminv = 1. / blaze::sum(*wc);
            if(!dist::detail::is_probability(measure)) { // e.g., take mean of unscaled values
                auto mat2schur = blz::expand(~(*wc) * rs, r.columns());
                PRETTY_SAY << "NOTPROB r dims: " << r.rows() << "/" << r.columns() << '\n';
                PRETTY_SAY << "NOTPROB mat2schur dims: " << mat2schur.rows() << "/" << mat2schur.columns() << '\n';
                ~ret = blaze::sum<blz::columnwise>(r % blz::expand(~(*wc) * rs, r.columns())) * wsuminv;
            } else { // Else take mean of scaled values
                auto mat2schur = blz::expand(~(*wc), r.columns());
                PRETTY_SAY << "PROB r dims: " << r.rows() << "/" << r.columns() << '\n';
                PRETTY_SAY << "PROB mat2schur dims: " << mat2schur.rows() << "/" << mat2schur.columns() << '\n';
                ~ret = blaze::sum<blz::columnwise>(r % blz::expand(~(*wc), r.columns())) * wsuminv;
                assert(blaze::max(~ret) < 1. || !std::fprintf(stderr, "max in ret: %g for a probability distribution.", blaze::max(~ret)));
            }
        } else {
            PRETTY_SAY << "Unweighted, anything but L1 or LLR: " << dist::detail::prob2str(measure) << '\n';
            if(dist::detail::is_probability(measure)) {
                // Weighted average for all
#ifndef NDEBUG
                auto expansion = blz::expand(trans(rs), r.columns());
                PRETTY_SAY << "PROB r dims: " << r.rows() << "/" << r.columns() << '\n';
                PRETTY_SAY << "PROB expansion dims: " << expansion.rows() << "/" << expansion.columns() << '\n';
#endif
                ~ret = blaze::sum<blz::columnwise>(r % blz::expand(trans(rs), r.columns())) * (1. / (blaze::sum(rs) * r.rows()));
            } else ~ret = blz::mean<blz::columnwise>(r % blz::expand(trans(rs), r.columns()));
        }
    }

    // Streaming update of a running centroid `ret` with one row `dat`, weighted by
    // `neww`, given the accumulated weight `cw` and the row's sum `row_sum`.
    template<typename FT, typename Row, typename Src>
    static void __perform_increment(FT neww, FT cw, Row &ret, const Src &dat, FT row_sum, dist::DissimilarityMeasure measure)
    {
        if(measure == dist::L1 || measure == dist::TOTAL_VARIATION_DISTANCE)
            throw std::invalid_argument("__perform_increment is only for linearly-calculated means, not L1 median");
        if(cw == 0.) {
            if(dist::detail::is_probability(measure))
                ret = dat;
            else
                ret = dat * row_sum;
        } else {
            auto div = neww / (neww + cw);
            if(dist::detail::is_probability(measure)) {
                ret += (dat - ret) * div;
            } else if(measure == dist::LLR || measure == dist::UWLLR) {
                ret += (dat * row_sum) * neww;
                // Add up total sum and subtract later
                // since there are three weighting factors here:
                // First, partial assignment
                // Then point-wise weights (both of which are in neww)
                // Then, for LLR/UWLLR, there's weighting by the row-sums
            } else {
                // Maintain running mean for full vector value
                ret += (dat * row_sum - ret) * div;
            }
        }
    }

    // Recompute all centers from a soft (fractional) assignment matrix, with optional
    // per-point weights wc and row sums rs; per-center mutexes are used under OpenMP.
    template<typename VT, bool TF, typename RowSums, typename MatType, typename CenterCon, typename VT2=blz::DynamicVector<blz::ElementType_t<VT>> >
    static void perform_soft_assignment(const blz::DenseMatrix<VT, TF> &assignments,
                                        const RowSums &rs,
                                        OMP_ONLY(std::mutex *mutptr,)
                                        const MatType &data, CenterCon &newcon,
                                        const VT2 *wc = static_cast<const VT2 *>(nullptr),
                                        dist::DissimilarityMeasure measure=static_cast<dist::DissimilarityMeasure>(-1))
    {
        using FT = blz::ElementType_t<VT>;
        PREC_REQ(measure != static_cast<dist::DissimilarityMeasure>(-1), "Must define dissimilarity measure");
        if(measure == dist::L1 || measure == dist::TOTAL_VARIATION_DISTANCE) {
            OMP_PFOR
            for(unsigned j = 0; j < newcon.size(); ++j) {
                blz::DynamicVector<FT, blz::rowVector> newweights;
                {
                    auto col = trans(column(assignments, j));
                    if(wc) newweights = col * *wc;
                    else newweights = col;
                }
                if(measure == dist::L1) {
                    std::conditional_t<blz::IsDenseMatrix_v<VT>,
                        blz::DynamicMatrix<FT>, blz::CompressedMatrix<FT>>
                        scaled_data = data % blz::expand(rs, data.columns());
                    coresets::l1_median(scaled_data, newcon[j], newweights.data());
                } else { // TVD
                    coresets::l1_median(data, newcon[j], newweights.data());
                }
            }
        } else {
            blz::DynamicVector<FT> summed_contribs(newcon.size(), 0.);
            OMP_PFOR
            for(size_t i = 0; i < data.rows(); ++i) {
                auto item_weight = wc ? wc->operator[](i): static_cast<FT>(1.);
                const auto row_sum = rs[i];
                auto asn(row(assignments, i, blz::unchecked));
                for(size_t j = 0; j < newcon.size(); ++j) {
                    auto &cw = summed_contribs[j];
                    if(auto asnw = asn[j]; asnw > 0.) {
                        auto neww = item_weight * asnw;
                        OMP_ONLY(if(mutptr) mutptr[j].lock();)
                        __perform_increment(neww, cw, newcon[j], row(data, i, blz::unchecked), row_sum, measure);
                        OMP_ONLY(if(mutptr) mutptr[j].unlock();)
                        OMP_ATOMIC
                        cw += neww;
                    }
                }
            }
            if(measure == dist::LLR || measure == dist::UWLLR || measure == dist::OLLR) {
                OMP_PFOR
                for(auto i = 0u; i < newcon.size(); ++i)
                    newcon[i] *= 1. / blz::dot(column(assignments, i), rs);
            }
        }
    }
}; // CentroidPolicy

} } // namespace minocore::clustering

#endif /* MINOCORE_CLUSTERING_CENTROID_H__ */
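For orientation, the non-median branches of perform_average reduce to weighted means over the input rows: probability-style measures average the normalized rows directly, count-style measures rescale each row by its row sum first, and the LLR family normalizes by the total weighted mass rather than the weight sum. A dense numpy transcription of the weighted branches only, with hypothetical names, intended as an illustration rather than the Blaze implementation above:

import numpy as np

def weighted_centroid(r, rs, wc, mode):
    # r: (n, d) normalized rows, rs: (n,) row sums, wc: (n,) point weights.
    # mode: "prob"  -> mean of normalized rows, weighted by wc
    #       "count" -> mean of unscaled counts: weight each row by wc * rs
    #       "llr"   -> same numerator as "count", normalized by dot(rs, wc)
    r, rs, wc = (np.asarray(a, dtype=float) for a in (r, rs, wc))
    if mode == "prob":
        return (r * wc[:, None]).sum(axis=0) / wc.sum()
    if mode == "count":
        return (r * (wc * rs)[:, None]).sum(axis=0) / wc.sum()
    return (r * (wc * rs)[:, None]).sum(axis=0) / np.dot(rs, wc)  # "llr"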