diff --git a/scedar/cluster/__init__.py b/scedar/cluster/__init__.py index 33294e5..13cc3c5 100644 --- a/scedar/cluster/__init__.py +++ b/scedar/cluster/__init__.py @@ -1,3 +1,4 @@ from scedar.cluster.mirac import MIRAC +from scedar.cluster.community import Community -__all__ = ["mirac"] +__all__ = ["mirac", "community"] diff --git a/scedar/cluster/community.py b/scedar/cluster/community.py new file mode 100644 index 0000000..f61838e --- /dev/null +++ b/scedar/cluster/community.py @@ -0,0 +1,165 @@ +import numpy as np + +from scedar.eda import SampleDistanceMatrix +from scedar.eda.slcs import SingleLabelClassifiedSamples as SLCS +from scedar import utils + +import leidenalg as la + +class Community(object): + """ + Community clustering + + Args + ---- + x : float array + Data matrix. + d : float array + Distance matrix. + graph: igraph.Graph + Need to have a weight attribute as affinity. If this argument + is not None, the graph will directly be used for community + clustering. + metric: {'cosine', 'euclidean'} + Metric used for nearest neighbor computation. + sids : sid list + List of sample ids. + fids : fid list + List of feature ids. + use_pdist : boolean + to use the pairwise distance matrix or not. The pairwise distance + matrix may be too large to save for datasets with a large number of + cells. + k : int + The number of nearest neighbors. + use_pca : bool + Use PCA for nearest neighbors or not. + use_hnsw : bool + Use Hierarchical Navigable Small World graph to compute + approximate nearest neighbor. + index_params : dict + Parameters used by HNSW in indexing. + efConstruction : int + Default 100. Higher value improves the quality of a constructed + graph and leads to higher accuracy of search. However this also + leads to longer indexing times. The reasonable range of values + is 100-2000. + M : int + Default 5. Higher value leads to better recall and shorter + retrieval times, at the expense of longer indexing time. The + reasonable range of values is 5-100. + delaunay_type : {0, 1, 2, 3} + Default 2. Pruning heuristic, which affects the trade-off + between retrieval performance and indexing time. The default + is usually quite good. + post : {0, 1, 2} + Default 0. The amount and type of postprocessing applied to the + constructed graph. 0 means no processing. 2 means more + processing. + indexThreadQty : int + Default self._nprocs. The number of threads used. + query_params : dict + Parameters used by HNSW in querying. + efSearch : int + Default 100. Higher value improves recall at the expense of + longer retrieval time. The reasonable range of values is + 100-2000. + aff_scale : float > 0 + Scaling factor used for converting distance to affinity. + Affinity = (max(distance) - distance) * aff_scale. + partition_method : str + Following methods are implemented in leidenalg package: + - RBConfigurationVertexPartition: only well-defined for positive edge + weights. + - RBERVertexPartition: well-defined only for positive edge weights. + - CPMVertexPartition: well-defined for both positive and negative edge + weights. + - SignificanceVertexPartition: well-defined only for unweighted graphs. + - SurpriseVertexPartition: well-defined only for positive edge weights. + resolution : float > 0 + Resolution used for community clustering. Higer value produces more + clusters. + random_state : int + Random number generator seed used for community clustering. + n_iter : int + Number of iterations used for community clustering. + nprocs : int > 0 + The number of processes/cores used for community clustering. + verbose : bool + Print progress or not. + + Attributes + ---------- + labs : label list + Labels of clustered samples. 1-to-1 matching to + from first to last. + _sdm : SampleDistanceMatrix + Data and distance matrices. + _graph : igraph.Graph + Graph used for clustering. + _la_res : leidenalg.VertexPartition + Partition results computed by leidenalg. + _k + _use_pca + _use_hnsw + _index_params + _query_params + _aff_scale + """ + + def __init__(self, x, d=None, graph=None, + metric="cosine", sids=None, fids=None, + use_pdist=False, k=15, use_pca=True, use_hnsw=True, + index_params=None, query_params=None, aff_scale=1, + partition_method="RBConfigurationVertexPartition", + resolution=1, random_state=None, n_iter=2, + nprocs=1, verbose=False): + super().__init__() + if aff_scale <= 0: + raise ValueError("Affinity scaling (aff_scale) shoud > 0.") + + if metric not in ("cosine", "euclidean"): + raise ValueError("Metric only supports cosine and euclidean.") + + self._sdm = SampleDistanceMatrix(x=x, d=d, metric=metric, + use_pdist=use_pdist, + sids=sids, fids=fids, nprocs=nprocs) + if graph is None: + knn_conn_mat = self._sdm.s_knn_connectivity_matrix( + k=k, use_pca=use_pca, use_hnsw=use_hnsw, + index_params=index_params, query_params=query_params, + verbose=verbose) + graph = SampleDistanceMatrix.knn_conn_mat_to_aff_graph( + knn_conn_mat, aff_scale=aff_scale) + + if partition_method == "RBConfigurationVertexPartition": + la_part_cls = la.RBConfigurationVertexPartition + elif partition_method == "RBERVertexPartition": + la_part_cls = la.RBERVertexPartition + elif partition_method == "CPMVertexPartition": + la_part_cls = la.CPMVertexPartition + elif partition_method == "SignificanceVertexPartition": + la_part_cls = la.SignificanceVertexPartition + elif partition_method == "SurpriseVertexPartition": + la_part_cls = la.SurpriseVertexPartition + else: + raise ValueError( + "Unknown partition method: {}".format(partition_method)) + + la_res = la.find_partition(graph, la.RBConfigurationVertexPartition, + seed=random_state, weights='weight', + resolution_parameter=resolution) + # keep track of results and parameters + self._graph = graph + self._la_res = la_res + self._labs = la_res.membership + self._k = k + self._use_pca = use_pca + self._use_hnsw = use_hnsw + self._index_params = index_params + self._query_params = query_params + self._aff_scale = aff_scale + + @property + def labs(self): + return self._labs.copy() diff --git a/scedar/eda/sdm.py b/scedar/eda/sdm.py index 7d19154..ec1b7d2 100644 --- a/scedar/eda/sdm.py +++ b/scedar/eda/sdm.py @@ -19,8 +19,11 @@ from collections import defaultdict import networkx as nx + from fa2 import ForceAtlas2 +import igraph as ig + from umap import UMAP import nmslib @@ -760,7 +763,6 @@ def s_knn_ind_lut(self, k): knn_order_ind_lut[ikey] = [t[0] for t in d_sorted_v[0:k]] return knn_order_ind_lut - # TODO: record as attribute def s_knn_connectivity_matrix(self, k, metric=None, use_pca=False, use_hnsw=False, index_params=None, query_params=None, @@ -785,14 +787,14 @@ def s_knn_connectivity_matrix(self, k, metric=None, use_pca=False, index_params: dict Parameters used by HNSW in indexing. efConstruction: int - Default 100. Higher value improves the quality of a constructed graph and - leads to higher accuracy of search. However this also leads to - longer indexing times. The reasonable range of values is - 100-2000. + Default 100. Higher value improves the quality of a constructed + graph and leads to higher accuracy of search. However this also + leads to longer indexing times. The reasonable range of values + is 100-2000. M: int - Default 5. Higher value leads to better recall and shorter retrieval - times, at the expense of longer indexing time. The reasonable - range of values is 5-100. + Default 5. Higher value leads to better recall and shorter + retrieval times, at the expense of longer indexing time. The + reasonable range of values is 5-100. delaunay_type: {0, 1, 2, 3} Default 2. Pruning heuristic, which affects the trade-off between retrieval performance and indexing time. The default @@ -846,7 +848,6 @@ def _s_knn_conn_mat_hnsw(self, k, metric=None, use_pca=False, if metric is None: metric = self._metric - if use_pca: data_x = self._pca_x is_sparse = False @@ -912,7 +913,8 @@ def _s_knn_conn_mat_hnsw(self, k, metric=None, use_pca=False, knn_con_mat = spsparse.coo_matrix( (knn_weights, (knn_sources, knn_targets)), shape=(data_x.shape[0], data_x.shape[0])) - return knn_con_mat + knn_con_csr = knn_con_mat.tocsr() + return knn_con_csr def _s_knn_conn_mat_skl(self, k, metric=None, use_pca=False, verbose=False): @@ -944,6 +946,16 @@ def _s_knn_conn_mat_skl(self, k, metric=None, use_pca=False, metric=metric, include_self=False) return knn_conn_mat + + @staticmethod + def knn_conn_mat_to_aff_graph(knn_conn_mat, aff_scale=1): + sources, targets = knn_conn_mat.nonzero() + weights = knn_conn_mat[sources, targets].A1 + weights = (weights.max() - weights) * aff_scale + graph = ig.Graph(edges=list(zip(sources, targets)), + directed=False, edge_attrs={"weight": weights}) + return graph + def s_knn_graph(self, k, gradient=None, labels=None, different_label_markers=True, aff_scale=1, @@ -989,7 +1001,6 @@ def s_knn_graph(self, k, gradient=None, labels=None, if knn_ng_param_key in self._knn_ng_lut: ng = self._knn_ng_lut[knn_ng_param_key] else: - # TODO: support sparse matrix knn_d_csr = self.s_knn_connectivity_matrix(k) # affinity shoud negatively correlate to distance knn_d_csr.data = (knn_d_csr.max() - knn_d_csr.data) * aff_scale diff --git a/setup.cfg b/setup.cfg index c0b054a..b9202dc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,3 +3,4 @@ test=pytest [tool:pytest] addopts = --verbose +filterwarnings = ignore:.*(imp module|SafeConfigParser):DeprecationWarning diff --git a/tests/test_cluster/test_community.py b/tests/test_cluster/test_community.py new file mode 100644 index 0000000..a909854 --- /dev/null +++ b/tests/test_cluster/test_community.py @@ -0,0 +1,51 @@ +import pytest + +import numpy as np + +import sklearn.datasets as skdset +from sklearn import metrics + +import scedar.cluster as cluster +import scedar.eda as eda + + +class TestCommunity(object): + '''docstring for TestMIRAC''' + np.random.seed(123) + x_20x5 = np.random.uniform(size=(20, 5)) + + def test_simple_run(self): + cluster.Community(self.x_20x5).labs + + def test_wrong_args(self): + with pytest.raises(ValueError): + cluster.Community(self.x_20x5, aff_scale=-0.1).labs + with pytest.raises(ValueError): + cluster.Community(self.x_20x5, metric='123').labs + with pytest.raises(ValueError): + cluster.Community(self.x_20x5, metric='correlation').labs + with pytest.raises(ValueError): + cluster.Community( + self.x_20x5, partition_method='NotImplementedMethod').labs + + def test_different_partition_methods(self): + cluster.Community( + self.x_20x5, + partition_method="RBConfigurationVertexPartition").labs + cluster.Community( + self.x_20x5, partition_method="RBERVertexPartition").labs + cluster.Community( + self.x_20x5, partition_method="CPMVertexPartition").labs + cluster.Community( + self.x_20x5, partition_method="SignificanceVertexPartition").labs + cluster.Community( + self.x_20x5, partition_method="SurpriseVertexPartition").labs + + def test_provide_graph(self): + sdm = eda.SampleDistanceMatrix(self.x_20x5) + knn_conn_mat = sdm.s_knn_connectivity_matrix(5) + knn_aff_graph = eda.SampleDistanceMatrix.knn_conn_mat_to_aff_graph( + knn_conn_mat, 2) + cluster.Community( + self.x_20x5, graph=knn_aff_graph, + partition_method="RBConfigurationVertexPartition").labs diff --git a/tests/test_eda/test_slcs.py b/tests/test_eda/test_slcs.py index c91c58e..54175b8 100644 --- a/tests/test_eda/test_slcs.py +++ b/tests/test_eda/test_slcs.py @@ -1037,7 +1037,7 @@ def test_encode_mdl(self): emdl = mdl_slcs.encode(np.arange(100).reshape(-1, 5)) emdl2 = mdl_slcs.encode(np.arange(100).reshape(-1, 5), nprocs=2) - assert emdl == emdl2 + np.testing.assert_approx_equal(emdl, emdl2) emdl3 = eda.MDLSingleLabelClassifiedSamples( self.x50x5, mdl_method=eda.mdl.GKdeMdl, labs=self.labs50,