Add community/Louvain/Leiden clustering method

Used community detection / clustering implementations from vtraag/leidenalg, https://github.com/vtraag/leidenalg.
TaylorResearchLab · Dec 17, 2019 · c938bea · c938bea
1 parent 57aebda
commit c938bea
Show file tree

Hide file tree

Showing 6 changed files with 242 additions and 13 deletions.
diff --git a/scedar/cluster/__init__.py b/scedar/cluster/__init__.py
@@ -1,3 +1,4 @@
 from scedar.cluster.mirac import MIRAC
+from scedar.cluster.community import Community
 
-__all__ = ["mirac"]
+__all__ = ["mirac", "community"]
diff --git a/scedar/cluster/community.py b/scedar/cluster/community.py
@@ -0,0 +1,165 @@
+import numpy as np
+
+from scedar.eda import SampleDistanceMatrix
+from scedar.eda.slcs import SingleLabelClassifiedSamples as SLCS
+from scedar import utils
+
+import leidenalg as la
+
+class Community(object):
+    """
+    Community clustering
+
+    Args
+    ----
+    x : float array
+        Data matrix.
+    d : float array
+        Distance matrix.
+    graph: igraph.Graph
+        Need to have a weight attribute as affinity. If this argument
+        is not None, the graph will directly be used for community
+        clustering.
+    metric: {'cosine', 'euclidean'}
+        Metric used for nearest neighbor computation.
+    sids : sid list
+        List of sample ids.
+    fids : fid list
+        List of feature ids.
+    use_pdist : boolean
+        to use the pairwise distance matrix or not. The pairwise distance
+        matrix may be too large to save for datasets with a large number of
+        cells.
+    k : int
+        The number of nearest neighbors.
+    use_pca : bool
+        Use PCA for nearest neighbors or not.
+    use_hnsw : bool
+        Use Hierarchical Navigable Small World graph to compute
+        approximate nearest neighbor.
+    index_params : dict
+        Parameters used by HNSW in indexing.
+        efConstruction : int
+            Default 100. Higher value improves the quality of a constructed
+            graph and leads to higher accuracy of search. However this also
+            leads to longer indexing times. The reasonable range of values
+            is 100-2000.
+        M : int
+            Default 5. Higher value leads to better recall and shorter 
+            retrieval times, at the expense of longer indexing time. The
+            reasonable range of values is 5-100.
+        delaunay_type : {0, 1, 2, 3}
+            Default 2. Pruning heuristic, which affects the trade-off
+            between retrieval performance and indexing time. The default
+            is usually quite good.
+        post : {0, 1, 2}
+            Default 0. The amount and type of postprocessing applied to the
+            constructed graph. 0 means no processing. 2 means more
+            processing.
+        indexThreadQty : int
+            Default self._nprocs. The number of threads used.
+    query_params : dict
+        Parameters used by HNSW in querying.
+        efSearch : int
+            Default 100. Higher value improves recall at the expense of
+            longer retrieval time. The reasonable range of values is
+            100-2000.
+    aff_scale : float > 0
+        Scaling factor used for converting distance to affinity.
+        Affinity = (max(distance) - distance) * aff_scale.
+    partition_method : str
+        Following methods are implemented in leidenalg package:
+        - RBConfigurationVertexPartition: only well-defined for positive edge
+          weights.
+        - RBERVertexPartition: well-defined only for positive edge weights.
+        - CPMVertexPartition: well-defined for both positive and negative edge
+          weights.
+        - SignificanceVertexPartition: well-defined only for unweighted graphs.
+        - SurpriseVertexPartition: well-defined only for positive edge weights.
+    resolution : float > 0
+        Resolution used for community clustering. Higer value produces more
+        clusters.
+    random_state : int
+        Random number generator seed used for community clustering.
+    n_iter : int
+        Number of iterations used for community clustering.
+    nprocs : int > 0
+        The number of processes/cores used for community clustering.
+    verbose : bool
+        Print progress or not.
+
+    Attributes
+    ----------
+        labs : label list
+            Labels of clustered samples. 1-to-1 matching to
+            from first to last.
+        _sdm : SampleDistanceMatrix
+            Data and distance matrices.
+        _graph : igraph.Graph
+            Graph used for clustering.
+        _la_res : leidenalg.VertexPartition
+            Partition results computed by leidenalg.
+        _k
+        _use_pca
+        _use_hnsw
+        _index_params
+        _query_params
+        _aff_scale
+    """
+
+    def __init__(self, x, d=None, graph=None,
+                 metric="cosine", sids=None, fids=None,
+                 use_pdist=False,  k=15, use_pca=True, use_hnsw=True,
+                 index_params=None, query_params=None, aff_scale=1,
+                 partition_method="RBConfigurationVertexPartition",
+                 resolution=1, random_state=None, n_iter=2,
+                 nprocs=1, verbose=False):
+        super().__init__()
+        if aff_scale <= 0:
+            raise ValueError("Affinity scaling (aff_scale) shoud > 0.")
+
+        if metric not in ("cosine", "euclidean"):
+            raise ValueError("Metric only supports cosine and euclidean.")
+
+        self._sdm = SampleDistanceMatrix(x=x, d=d, metric=metric,
+                                         use_pdist=use_pdist,
+                                         sids=sids, fids=fids, nprocs=nprocs)
+        if graph is None:
+            knn_conn_mat = self._sdm.s_knn_connectivity_matrix(
+                k=k, use_pca=use_pca, use_hnsw=use_hnsw,
+                index_params=index_params, query_params=query_params,
+                verbose=verbose)
+            graph = SampleDistanceMatrix.knn_conn_mat_to_aff_graph(
+                knn_conn_mat, aff_scale=aff_scale)
+
+        if partition_method == "RBConfigurationVertexPartition":
+            la_part_cls = la.RBConfigurationVertexPartition
+        elif partition_method == "RBERVertexPartition":
+            la_part_cls = la.RBERVertexPartition
+        elif partition_method == "CPMVertexPartition":
+            la_part_cls = la.CPMVertexPartition
+        elif partition_method == "SignificanceVertexPartition":
+            la_part_cls = la.SignificanceVertexPartition
+        elif partition_method == "SurpriseVertexPartition":
+            la_part_cls = la.SurpriseVertexPartition
+        else:
+            raise ValueError(
+                "Unknown partition method: {}".format(partition_method))
+
+        la_res = la.find_partition(graph, la.RBConfigurationVertexPartition,
+                                   seed=random_state, weights='weight',
+                                   resolution_parameter=resolution)
+        # keep track of results and parameters
+        self._graph = graph
+        self._la_res = la_res
+        self._labs = la_res.membership
+        self._k = k
+        self._use_pca = use_pca
+        self._use_hnsw = use_hnsw
+        self._index_params = index_params
+        self._query_params = query_params
+        self._aff_scale = aff_scale
+
+    @property
+    def labs(self):
+        return self._labs.copy()
diff --git a/scedar/eda/sdm.py b/scedar/eda/sdm.py
@@ -19,8 +19,11 @@
 from collections import defaultdict
 
 import networkx as nx
+
 from fa2 import ForceAtlas2
 
+import igraph as ig
+
 from umap import UMAP
 
 import nmslib
@@ -760,7 +763,6 @@ def s_knn_ind_lut(self, k):
                 knn_order_ind_lut[ikey] = [t[0] for t in d_sorted_v[0:k]]
             return knn_order_ind_lut
 
-    # TODO: record as attribute
     def s_knn_connectivity_matrix(self, k, metric=None, use_pca=False,
                                   use_hnsw=False,
                                   index_params=None, query_params=None,
@@ -785,14 +787,14 @@ def s_knn_connectivity_matrix(self, k, metric=None, use_pca=False,
         index_params: dict
             Parameters used by HNSW in indexing.
             efConstruction: int
-                Default 100. Higher value improves the quality of a constructed graph and
-                leads to higher accuracy of search. However this also leads to
-                longer indexing times. The reasonable range of values is
-                100-2000.
+                Default 100. Higher value improves the quality of a constructed
+                graph and leads to higher accuracy of search. However this also
+                leads to longer indexing times. The reasonable range of values
+                is 100-2000.
             M: int
-                Default 5. Higher value leads to better recall and shorter retrieval
-                times, at the expense of longer indexing time. The reasonable
-                range of values is 5-100.
+                Default 5. Higher value leads to better recall and shorter 
+                retrieval times, at the expense of longer indexing time. The
+                reasonable range of values is 5-100.
             delaunay_type: {0, 1, 2, 3}
                 Default 2. Pruning heuristic, which affects the trade-off
                 between retrieval performance and indexing time. The default
@@ -846,7 +848,6 @@ def _s_knn_conn_mat_hnsw(self, k, metric=None, use_pca=False,
         if metric is None:
             metric = self._metric
 
-
         if use_pca:
             data_x = self._pca_x
             is_sparse = False
@@ -912,7 +913,8 @@ def _s_knn_conn_mat_hnsw(self, k, metric=None, use_pca=False,
         knn_con_mat = spsparse.coo_matrix(
             (knn_weights, (knn_sources, knn_targets)),
             shape=(data_x.shape[0], data_x.shape[0]))
-        return knn_con_mat
+        knn_con_csr = knn_con_mat.tocsr()
+        return knn_con_csr
 
     def _s_knn_conn_mat_skl(self, k, metric=None, use_pca=False,
                             verbose=False):
@@ -944,6 +946,16 @@ def _s_knn_conn_mat_skl(self, k, metric=None, use_pca=False,
                                                 metric=metric,
                                                 include_self=False)
         return knn_conn_mat
+
+    @staticmethod
+    def knn_conn_mat_to_aff_graph(knn_conn_mat, aff_scale=1):
+        sources, targets = knn_conn_mat.nonzero()
+        weights = knn_conn_mat[sources, targets].A1
+        weights = (weights.max() - weights) * aff_scale
+        graph = ig.Graph(edges=list(zip(sources, targets)),
+                         directed=False, edge_attrs={"weight": weights})
+        return graph
+
 
     def s_knn_graph(self, k, gradient=None, labels=None,
                     different_label_markers=True, aff_scale=1,
@@ -989,7 +1001,6 @@ def s_knn_graph(self, k, gradient=None, labels=None,
         if knn_ng_param_key in self._knn_ng_lut:
             ng = self._knn_ng_lut[knn_ng_param_key]
         else:
-            # TODO: support sparse matrix
             knn_d_csr = self.s_knn_connectivity_matrix(k)
             # affinity shoud negatively correlate to distance
             knn_d_csr.data = (knn_d_csr.max() - knn_d_csr.data) * aff_scale

diff --git a/setup.cfg b/setup.cfg
@@ -3,3 +3,4 @@ test=pytest
 
 [tool:pytest]
 addopts = --verbose
+filterwarnings = ignore:.*(imp module|SafeConfigParser):DeprecationWarning
diff --git a/tests/test_cluster/test_community.py b/tests/test_cluster/test_community.py
@@ -0,0 +1,51 @@
+import pytest
+
+import numpy as np
+
+import sklearn.datasets as skdset
+from sklearn import metrics
+
+import scedar.cluster as cluster
+import scedar.eda as eda
+
+
+class TestCommunity(object):
+    '''docstring for TestMIRAC'''
+    np.random.seed(123)
+    x_20x5 = np.random.uniform(size=(20, 5))
+
+    def test_simple_run(self):
+        cluster.Community(self.x_20x5).labs
+
+    def test_wrong_args(self):
+        with pytest.raises(ValueError):
+            cluster.Community(self.x_20x5, aff_scale=-0.1).labs
+        with pytest.raises(ValueError):
+            cluster.Community(self.x_20x5, metric='123').labs
+        with pytest.raises(ValueError):
+            cluster.Community(self.x_20x5, metric='correlation').labs
+        with pytest.raises(ValueError):
+            cluster.Community(
+                self.x_20x5, partition_method='NotImplementedMethod').labs
+
+    def test_different_partition_methods(self):
+        cluster.Community(
+            self.x_20x5, 
+            partition_method="RBConfigurationVertexPartition").labs
+        cluster.Community(
+            self.x_20x5, partition_method="RBERVertexPartition").labs
+        cluster.Community(
+            self.x_20x5, partition_method="CPMVertexPartition").labs
+        cluster.Community(
+            self.x_20x5, partition_method="SignificanceVertexPartition").labs
+        cluster.Community(
+            self.x_20x5, partition_method="SurpriseVertexPartition").labs
+
+    def test_provide_graph(self):
+        sdm = eda.SampleDistanceMatrix(self.x_20x5)
+        knn_conn_mat = sdm.s_knn_connectivity_matrix(5)
+        knn_aff_graph = eda.SampleDistanceMatrix.knn_conn_mat_to_aff_graph(
+            knn_conn_mat, 2)
+        cluster.Community(
+            self.x_20x5, graph=knn_aff_graph,
+            partition_method="RBConfigurationVertexPartition").labs
diff --git a/tests/test_eda/test_slcs.py b/tests/test_eda/test_slcs.py
@@ -1037,7 +1037,7 @@ def test_encode_mdl(self):
         emdl = mdl_slcs.encode(np.arange(100).reshape(-1, 5))
         emdl2 = mdl_slcs.encode(np.arange(100).reshape(-1, 5), nprocs=2)
 
-        assert emdl == emdl2
+        np.testing.assert_approx_equal(emdl, emdl2)
 
         emdl3 = eda.MDLSingleLabelClassifiedSamples(
             self.x50x5, mdl_method=eda.mdl.GKdeMdl, labs=self.labs50,
Original file line number	Diff line number	Diff line change
Expand Up		@@ -3,3 +3,4 @@ test=pytest

		[tool:pytest]
		addopts = --verbose
		filterwarnings = ignore:.*(imp module\|SafeConfigParser):DeprecationWarning