-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add community/Louvain/Leiden clustering method
Used community detection / clustering implementations from vtraag/leidenalg, https://github.com/vtraag/leidenalg.
- Loading branch information
Showing
6 changed files
with
242 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from scedar.cluster.mirac import MIRAC | ||
from scedar.cluster.community import Community | ||
|
||
__all__ = ["mirac"] | ||
__all__ = ["mirac", "community"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
import numpy as np | ||
|
||
from scedar.eda import SampleDistanceMatrix | ||
from scedar.eda.slcs import SingleLabelClassifiedSamples as SLCS | ||
from scedar import utils | ||
|
||
import leidenalg as la | ||
|
||
class Community(object):
    """
    Community (Louvain/Leiden) clustering backed by the leidenalg package.

    Args
    ----
    x : float array
        Data matrix.
    d : float array
        Distance matrix.
    graph: igraph.Graph
        Need to have a weight attribute as affinity. If this argument
        is not None, the graph will directly be used for community
        clustering and no nearest neighbor graph is computed.
    metric: {'cosine', 'euclidean'}
        Metric used for nearest neighbor computation.
    sids : sid list
        List of sample ids.
    fids : fid list
        List of feature ids.
    use_pdist : boolean
        to use the pairwise distance matrix or not. The pairwise distance
        matrix may be too large to save for datasets with a large number of
        cells.
    k : int
        The number of nearest neighbors.
    use_pca : bool
        Use PCA for nearest neighbors or not.
    use_hnsw : bool
        Use Hierarchical Navigable Small World graph to compute
        approximate nearest neighbor.
    index_params : dict
        Parameters used by HNSW in indexing.
        efConstruction : int
            Default 100. Higher value improves the quality of a constructed
            graph and leads to higher accuracy of search. However this also
            leads to longer indexing times. The reasonable range of values
            is 100-2000.
        M : int
            Default 5. Higher value leads to better recall and shorter
            retrieval times, at the expense of longer indexing time. The
            reasonable range of values is 5-100.
        delaunay_type : {0, 1, 2, 3}
            Default 2. Pruning heuristic, which affects the trade-off
            between retrieval performance and indexing time. The default
            is usually quite good.
        post : {0, 1, 2}
            Default 0. The amount and type of postprocessing applied to the
            constructed graph. 0 means no processing. 2 means more
            processing.
        indexThreadQty : int
            Default self._nprocs. The number of threads used.
    query_params : dict
        Parameters used by HNSW in querying.
        efSearch : int
            Default 100. Higher value improves recall at the expense of
            longer retrieval time. The reasonable range of values is
            100-2000.
    aff_scale : float > 0
        Scaling factor used for converting distance to affinity.
        Affinity = (max(distance) - distance) * aff_scale.
    partition_method : str
        Name of the leidenalg partition class to optimise. Following
        methods are implemented in leidenalg package:
        - RBConfigurationVertexPartition: only well-defined for positive
          edge weights.
        - RBERVertexPartition: well-defined only for positive edge weights.
        - CPMVertexPartition: well-defined for both positive and negative
          edge weights.
        - SignificanceVertexPartition: well-defined only for unweighted
          graphs. The edge weights and `resolution` are not used.
        - SurpriseVertexPartition: well-defined only for positive edge
          weights. `resolution` is not used.
    resolution : float > 0
        Resolution used for community clustering. Higher value produces
        more clusters. Only forwarded to the partition methods that have
        a resolution parameter (RBConfiguration, RBER and CPM).
    random_state : int
        Random number generator seed used for community clustering.
    n_iter : int
        Number of iterations used for community clustering.
    nprocs : int > 0
        The number of processes/cores. Passed to SampleDistanceMatrix.
    verbose : bool
        Print progress or not.

    Raises
    ------
    ValueError
        If `aff_scale` is not positive, `metric` is not supported, or
        `partition_method` is not one of the methods listed above.

    Attributes
    ----------
    labs : label list
        Labels of clustered samples, i.e. a copy of the membership list
        of the leidenalg partition, one label per graph vertex.
    _sdm : SampleDistanceMatrix
        Data and distance matrices.
    _graph : igraph.Graph
        Graph used for clustering.
    _la_res : leidenalg.VertexPartition
        Partition results computed by leidenalg.
    _k
    _use_pca
    _use_hnsw
    _index_params
    _query_params
    _aff_scale
    """

    # Supported leidenalg partition classes mapped to the optional keyword
    # arguments their constructors accept. Names are resolved lazily with
    # getattr so that this table does not touch leidenalg at import time.
    _PARTITION_KWARGS = {
        "RBConfigurationVertexPartition": ("weights", "resolution_parameter"),
        "RBERVertexPartition": ("weights", "resolution_parameter"),
        "CPMVertexPartition": ("weights", "resolution_parameter"),
        "SignificanceVertexPartition": (),
        "SurpriseVertexPartition": ("weights",),
    }

    def __init__(self, x, d=None, graph=None,
                 metric="cosine", sids=None, fids=None,
                 use_pdist=False, k=15, use_pca=True, use_hnsw=True,
                 index_params=None, query_params=None, aff_scale=1,
                 partition_method="RBConfigurationVertexPartition",
                 resolution=1, random_state=None, n_iter=2,
                 nprocs=1, verbose=False):
        super().__init__()
        # Validate all cheap arguments before any expensive computation.
        if aff_scale <= 0:
            raise ValueError("Affinity scaling (aff_scale) shoud > 0.")

        if metric not in ("cosine", "euclidean"):
            raise ValueError("Metric only supports cosine and euclidean.")

        # isinstance guard keeps unhashable inputs on the ValueError path.
        if (not isinstance(partition_method, str)
                or partition_method not in self._PARTITION_KWARGS):
            raise ValueError(
                "Unknown partition method: {}".format(partition_method))

        # The distance matrix object is kept even when a graph is provided.
        self._sdm = SampleDistanceMatrix(x=x, d=d, metric=metric,
                                         use_pdist=use_pdist,
                                         sids=sids, fids=fids, nprocs=nprocs)
        if graph is None:
            knn_conn_mat = self._sdm.s_knn_connectivity_matrix(
                k=k, use_pca=use_pca, use_hnsw=use_hnsw,
                index_params=index_params, query_params=query_params,
                verbose=verbose)
            graph = SampleDistanceMatrix.knn_conn_mat_to_aff_graph(
                knn_conn_mat, aff_scale=aff_scale)

        la_part_cls = getattr(la, partition_method)
        # Only forward the keywords the chosen partition class understands;
        # find_partition hands unknown keywords straight to its constructor.
        accepted_kwargs = self._PARTITION_KWARGS[partition_method]
        part_kwargs = {}
        if "weights" in accepted_kwargs:
            part_kwargs["weights"] = 'weight'
        if "resolution_parameter" in accepted_kwargs:
            part_kwargs["resolution_parameter"] = resolution

        la_res = la.find_partition(graph, la_part_cls,
                                   n_iterations=n_iter, seed=random_state,
                                   **part_kwargs)
        # keep track of results and parameters
        self._graph = graph
        self._la_res = la_res
        self._labs = la_res.membership
        self._k = k
        self._use_pca = use_pca
        self._use_hnsw = use_hnsw
        self._index_params = index_params
        self._query_params = query_params
        self._aff_scale = aff_scale

    @property
    def labs(self):
        """Copy of the cluster labels, so callers cannot mutate the result."""
        return self._labs.copy()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
import pytest | ||
|
||
import numpy as np | ||
|
||
import sklearn.datasets as skdset | ||
from sklearn import metrics | ||
|
||
import scedar.cluster as cluster | ||
import scedar.eda as eda | ||
|
||
|
||
class TestCommunity(object):
    """Smoke tests for cluster.Community on a small random data matrix."""
    # Seed the global numpy RNG so the fixture below is reproducible.
    np.random.seed(123)
    x_20x5 = np.random.uniform(size=(20, 5))

    def test_simple_run(self):
        """Default arguments should cluster the fixture without raising."""
        clusterer = cluster.Community(self.x_20x5)
        clusterer.labs

    def test_wrong_args(self):
        """Each invalid keyword argument should raise ValueError."""
        invalid_kwargs = (
            {"aff_scale": -0.1},
            {"metric": '123'},
            {"metric": 'correlation'},
            {"partition_method": 'NotImplementedMethod'},
        )
        for kwargs in invalid_kwargs:
            with pytest.raises(ValueError):
                cluster.Community(self.x_20x5, **kwargs).labs

    def test_different_partition_methods(self):
        """Every documented partition method name should be accepted."""
        method_names = (
            "RBConfigurationVertexPartition",
            "RBERVertexPartition",
            "CPMVertexPartition",
            "SignificanceVertexPartition",
            "SurpriseVertexPartition",
        )
        for method_name in method_names:
            cluster.Community(
                self.x_20x5, partition_method=method_name).labs

    def test_provide_graph(self):
        """A prebuilt affinity graph can be passed in for clustering."""
        dist_mat = eda.SampleDistanceMatrix(self.x_20x5)
        conn_mat = dist_mat.s_knn_connectivity_matrix(5)
        aff_graph = eda.SampleDistanceMatrix.knn_conn_mat_to_aff_graph(
            conn_mat, 2)
        clusterer = cluster.Community(
            self.x_20x5, graph=aff_graph,
            partition_method="RBConfigurationVertexPartition")
        clusterer.labs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters