Skip to content

Commit

Permalink
Add community/Louvain/Leiden clustering method
Browse files Browse the repository at this point in the history
Used community detection / clustering implementations from
vtraag/leidenalg, https://github.com/vtraag/leidenalg.
  • Loading branch information
logstar committed Dec 17, 2019
1 parent 57aebda commit c938bea
Show file tree
Hide file tree
Showing 6 changed files with 242 additions and 13 deletions.
3 changes: 2 additions & 1 deletion scedar/cluster/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from scedar.cluster.mirac import MIRAC
from scedar.cluster.community import Community

__all__ = ["mirac"]
__all__ = ["mirac", "community"]
165 changes: 165 additions & 0 deletions scedar/cluster/community.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import numpy as np

from scedar.eda import SampleDistanceMatrix
from scedar.eda.slcs import SingleLabelClassifiedSamples as SLCS
from scedar import utils

import leidenalg as la

class Community(object):
"""
Community clustering
Args
----
x : float array
Data matrix.
d : float array
Distance matrix.
graph: igraph.Graph
Need to have a weight attribute as affinity. If this argument
is not None, the graph will directly be used for community
clustering.
metric: {'cosine', 'euclidean'}
Metric used for nearest neighbor computation.
sids : sid list
List of sample ids.
fids : fid list
List of feature ids.
use_pdist : boolean
to use the pairwise distance matrix or not. The pairwise distance
matrix may be too large to save for datasets with a large number of
cells.
k : int
The number of nearest neighbors.
use_pca : bool
Use PCA for nearest neighbors or not.
use_hnsw : bool
Use Hierarchical Navigable Small World graph to compute
approximate nearest neighbor.
index_params : dict
Parameters used by HNSW in indexing.
efConstruction : int
Default 100. Higher value improves the quality of a constructed
graph and leads to higher accuracy of search. However this also
leads to longer indexing times. The reasonable range of values
is 100-2000.
M : int
Default 5. Higher value leads to better recall and shorter
retrieval times, at the expense of longer indexing time. The
reasonable range of values is 5-100.
delaunay_type : {0, 1, 2, 3}
Default 2. Pruning heuristic, which affects the trade-off
between retrieval performance and indexing time. The default
is usually quite good.
post : {0, 1, 2}
Default 0. The amount and type of postprocessing applied to the
constructed graph. 0 means no processing. 2 means more
processing.
indexThreadQty : int
Default self._nprocs. The number of threads used.
query_params : dict
Parameters used by HNSW in querying.
efSearch : int
Default 100. Higher value improves recall at the expense of
longer retrieval time. The reasonable range of values is
100-2000.
aff_scale : float > 0
Scaling factor used for converting distance to affinity.
Affinity = (max(distance) - distance) * aff_scale.
partition_method : str
Following methods are implemented in leidenalg package:
- RBConfigurationVertexPartition: only well-defined for positive edge
weights.
- RBERVertexPartition: well-defined only for positive edge weights.
- CPMVertexPartition: well-defined for both positive and negative edge
weights.
- SignificanceVertexPartition: well-defined only for unweighted graphs.
- SurpriseVertexPartition: well-defined only for positive edge weights.
resolution : float > 0
Resolution used for community clustering. Higer value produces more
clusters.
random_state : int
Random number generator seed used for community clustering.
n_iter : int
Number of iterations used for community clustering.
nprocs : int > 0
The number of processes/cores used for community clustering.
verbose : bool
Print progress or not.
Attributes
----------
labs : label list
Labels of clustered samples. 1-to-1 matching to
from first to last.
_sdm : SampleDistanceMatrix
Data and distance matrices.
_graph : igraph.Graph
Graph used for clustering.
_la_res : leidenalg.VertexPartition
Partition results computed by leidenalg.
_k
_use_pca
_use_hnsw
_index_params
_query_params
_aff_scale
"""

def __init__(self, x, d=None, graph=None,
metric="cosine", sids=None, fids=None,
use_pdist=False, k=15, use_pca=True, use_hnsw=True,
index_params=None, query_params=None, aff_scale=1,
partition_method="RBConfigurationVertexPartition",
resolution=1, random_state=None, n_iter=2,
nprocs=1, verbose=False):
super().__init__()
if aff_scale <= 0:
raise ValueError("Affinity scaling (aff_scale) shoud > 0.")

if metric not in ("cosine", "euclidean"):
raise ValueError("Metric only supports cosine and euclidean.")

self._sdm = SampleDistanceMatrix(x=x, d=d, metric=metric,
use_pdist=use_pdist,
sids=sids, fids=fids, nprocs=nprocs)
if graph is None:
knn_conn_mat = self._sdm.s_knn_connectivity_matrix(
k=k, use_pca=use_pca, use_hnsw=use_hnsw,
index_params=index_params, query_params=query_params,
verbose=verbose)
graph = SampleDistanceMatrix.knn_conn_mat_to_aff_graph(
knn_conn_mat, aff_scale=aff_scale)

if partition_method == "RBConfigurationVertexPartition":
la_part_cls = la.RBConfigurationVertexPartition
elif partition_method == "RBERVertexPartition":
la_part_cls = la.RBERVertexPartition
elif partition_method == "CPMVertexPartition":
la_part_cls = la.CPMVertexPartition
elif partition_method == "SignificanceVertexPartition":
la_part_cls = la.SignificanceVertexPartition
elif partition_method == "SurpriseVertexPartition":
la_part_cls = la.SurpriseVertexPartition
else:
raise ValueError(
"Unknown partition method: {}".format(partition_method))

la_res = la.find_partition(graph, la.RBConfigurationVertexPartition,
seed=random_state, weights='weight',
resolution_parameter=resolution)
# keep track of results and parameters
self._graph = graph
self._la_res = la_res
self._labs = la_res.membership
self._k = k
self._use_pca = use_pca
self._use_hnsw = use_hnsw
self._index_params = index_params
self._query_params = query_params
self._aff_scale = aff_scale

@property
def labs(self):
return self._labs.copy()
33 changes: 22 additions & 11 deletions scedar/eda/sdm.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@
from collections import defaultdict

import networkx as nx

from fa2 import ForceAtlas2

import igraph as ig

from umap import UMAP

import nmslib
Expand Down Expand Up @@ -760,7 +763,6 @@ def s_knn_ind_lut(self, k):
knn_order_ind_lut[ikey] = [t[0] for t in d_sorted_v[0:k]]
return knn_order_ind_lut

# TODO: record as attribute
def s_knn_connectivity_matrix(self, k, metric=None, use_pca=False,
use_hnsw=False,
index_params=None, query_params=None,
Expand All @@ -785,14 +787,14 @@ def s_knn_connectivity_matrix(self, k, metric=None, use_pca=False,
index_params: dict
Parameters used by HNSW in indexing.
efConstruction: int
Default 100. Higher value improves the quality of a constructed graph and
leads to higher accuracy of search. However this also leads to
longer indexing times. The reasonable range of values is
100-2000.
Default 100. Higher value improves the quality of a constructed
graph and leads to higher accuracy of search. However this also
leads to longer indexing times. The reasonable range of values
is 100-2000.
M: int
Default 5. Higher value leads to better recall and shorter retrieval
times, at the expense of longer indexing time. The reasonable
range of values is 5-100.
Default 5. Higher value leads to better recall and shorter
retrieval times, at the expense of longer indexing time. The
reasonable range of values is 5-100.
delaunay_type: {0, 1, 2, 3}
Default 2. Pruning heuristic, which affects the trade-off
between retrieval performance and indexing time. The default
Expand Down Expand Up @@ -846,7 +848,6 @@ def _s_knn_conn_mat_hnsw(self, k, metric=None, use_pca=False,
if metric is None:
metric = self._metric


if use_pca:
data_x = self._pca_x
is_sparse = False
Expand Down Expand Up @@ -912,7 +913,8 @@ def _s_knn_conn_mat_hnsw(self, k, metric=None, use_pca=False,
knn_con_mat = spsparse.coo_matrix(
(knn_weights, (knn_sources, knn_targets)),
shape=(data_x.shape[0], data_x.shape[0]))
return knn_con_mat
knn_con_csr = knn_con_mat.tocsr()
return knn_con_csr

def _s_knn_conn_mat_skl(self, k, metric=None, use_pca=False,
verbose=False):
Expand Down Expand Up @@ -944,6 +946,16 @@ def _s_knn_conn_mat_skl(self, k, metric=None, use_pca=False,
metric=metric,
include_self=False)
return knn_conn_mat

@staticmethod
def knn_conn_mat_to_aff_graph(knn_conn_mat, aff_scale=1):
sources, targets = knn_conn_mat.nonzero()
weights = knn_conn_mat[sources, targets].A1
weights = (weights.max() - weights) * aff_scale
graph = ig.Graph(edges=list(zip(sources, targets)),
directed=False, edge_attrs={"weight": weights})
return graph


def s_knn_graph(self, k, gradient=None, labels=None,
different_label_markers=True, aff_scale=1,
Expand Down Expand Up @@ -989,7 +1001,6 @@ def s_knn_graph(self, k, gradient=None, labels=None,
if knn_ng_param_key in self._knn_ng_lut:
ng = self._knn_ng_lut[knn_ng_param_key]
else:
# TODO: support sparse matrix
knn_d_csr = self.s_knn_connectivity_matrix(k)
# affinity shoud negatively correlate to distance
knn_d_csr.data = (knn_d_csr.max() - knn_d_csr.data) * aff_scale
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ test=pytest

[tool:pytest]
addopts = --verbose
filterwarnings = ignore:.*(imp module|SafeConfigParser):DeprecationWarning
51 changes: 51 additions & 0 deletions tests/test_cluster/test_community.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pytest

import numpy as np

import sklearn.datasets as skdset
from sklearn import metrics

import scedar.cluster as cluster
import scedar.eda as eda


class TestCommunity(object):
'''docstring for TestMIRAC'''
np.random.seed(123)
x_20x5 = np.random.uniform(size=(20, 5))

def test_simple_run(self):
cluster.Community(self.x_20x5).labs

def test_wrong_args(self):
with pytest.raises(ValueError):
cluster.Community(self.x_20x5, aff_scale=-0.1).labs
with pytest.raises(ValueError):
cluster.Community(self.x_20x5, metric='123').labs
with pytest.raises(ValueError):
cluster.Community(self.x_20x5, metric='correlation').labs
with pytest.raises(ValueError):
cluster.Community(
self.x_20x5, partition_method='NotImplementedMethod').labs

def test_different_partition_methods(self):
cluster.Community(
self.x_20x5,
partition_method="RBConfigurationVertexPartition").labs
cluster.Community(
self.x_20x5, partition_method="RBERVertexPartition").labs
cluster.Community(
self.x_20x5, partition_method="CPMVertexPartition").labs
cluster.Community(
self.x_20x5, partition_method="SignificanceVertexPartition").labs
cluster.Community(
self.x_20x5, partition_method="SurpriseVertexPartition").labs

def test_provide_graph(self):
sdm = eda.SampleDistanceMatrix(self.x_20x5)
knn_conn_mat = sdm.s_knn_connectivity_matrix(5)
knn_aff_graph = eda.SampleDistanceMatrix.knn_conn_mat_to_aff_graph(
knn_conn_mat, 2)
cluster.Community(
self.x_20x5, graph=knn_aff_graph,
partition_method="RBConfigurationVertexPartition").labs
2 changes: 1 addition & 1 deletion tests/test_eda/test_slcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1037,7 +1037,7 @@ def test_encode_mdl(self):
emdl = mdl_slcs.encode(np.arange(100).reshape(-1, 5))
emdl2 = mdl_slcs.encode(np.arange(100).reshape(-1, 5), nprocs=2)

assert emdl == emdl2
np.testing.assert_approx_equal(emdl, emdl2)

emdl3 = eda.MDLSingleLabelClassifiedSamples(
self.x50x5, mdl_method=eda.mdl.GKdeMdl, labs=self.labs50,
Expand Down

0 comments on commit c938bea

Please sign in to comment.