Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new hierachical clustering tool based on scipy #6214

Merged
merged 9 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 136 additions & 0 deletions tools/clustering_from_distmat/clustering_from_distmat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import argparse
import sys

import scipy


def linkage_as_newick(linkage, tip_names):
newick_parts = tip_names[::]
within_cluster_distances = [0]*len(tip_names)
for step in linkage:
n1 = int(step[0])
n2 = int(step[1])
d = float(step[2])
d1 = d - within_cluster_distances[n1]
d2 = d - within_cluster_distances[n2]
id1 = newick_parts[n1]
id2 = newick_parts[n2]
part = f'({id1}:{d1 / 2},{id2}:{d2 / 2})'
within_cluster_distances.append(d)
newick_parts.append(part)
return newick_parts[-1].format(*newick_parts) + ';'


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'infile',
help='Distance matrix input file'
)
parser.add_argument(
'out_prefix',
help="Output prefix"
)
parser.add_argument
parser.add_argument(
'-m', '--method', default="average",
choices = [
"single",
"complete",
"average",
"weighted",
"centroid",
"median",
"ward"
],
help="Clustering method to use"
)
cut_mode = parser.add_mutually_exclusive_group()
cut_mode.add_argument(
"-n", "--n-clusters", nargs="*", type=int
)
cut_mode.add_argument(
"--height", nargs="*", type=float
)
args = parser.parse_args()

# TO DO:
# - parse outputs to generate

# read from input and check that
# we have been passed a symmetric distance matrix
with open(args.infile) as i:
col_names = next(i).rstrip("\n\r").split("\t")[1:]
col_count = len(col_names)
if not col_count:
sys.exit(
'No data columns found. '
'This tool expects tabular input with column names on the first line '
'and a row name in the first column of each row followed by data columns.'
)
row_count = 0
matrix = []
for line in i:
if not line.strip():
# skip empty lines
continue
row_count += 1
if row_count > col_count:
sys.exit(
'This tool expects a symmetric distance matrix with an equal number of rows and columns, '
'but got more rows than columns.'
)
row_name, *row_data = line.strip(" \n\r").split("\t")
col_name = col_names[row_count - 1]
if not row_name:
# tolerate omitted row names, use col name instead
row_name = col_name
if row_name != col_name:
sys.exit(
'This tool expects a symmetric distance matrix with identical names for rows and columns, '
f'but got "{col_name}" in column {row_count} and "{row_name}" on row {row_count}.'
)
if len(row_data) != col_count:
sys.exit(
'This tool expects a symmetric distance matrix with the same number of columns on each row, '
f'but row {row_count} ("{row_name}") has {len(row_data)} columns instead of {col_count}.'
)
try:
matrix.append([float(x) for x in row_data])
except ValueError as e:
sys.exit(str(e) + f' on row {row_count} ("{row_name}")')
if row_count < col_count:
sys.exit(
'This tool expects a symmetric distance matrix with an equal number of rows and columns, '
'but got more columns than rows.'
)

# turn the distance matrix into "condensed" vector form
# this gives us further checks and raises ValueErrors if:
# - the values on the diagonal aren't zero
# - the upper and lower triangle of the matrix aren't identical
D = scipy.spatial.distance.squareform(matrix)

# perform the requested clustering and retrieve the result as a linkage object
linkage = scipy.cluster.hierarchy.linkage(D, args.method)

with open(args.out_prefix + '.tree.newick', 'w') as o:
o.write(linkage_as_newick(linkage, col_names))

if args.n_clusters or args.height:
cluster_assignments = []
for name, cluster_ids in zip(
col_names,
scipy.cluster.hierarchy.cut_tree(
linkage,
args.n_clusters,
args.height
)
):
cluster_assignments.append(
[name]
+ [str(c + 1) for c in cluster_ids]
)
with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o:
for ass in cluster_assignments:
print("\t".join(ass), file=o)
102 changes: 102 additions & 0 deletions tools/clustering_from_distmat/clustering_from_distmat.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
<tool id="clustering_from_distmat" name="Distance matrix-based hierarchical clustering" version="1.0" profile="23.0">
<description>using Scipy</description>
<requirements>
<requirement type="package" version="3.12">python</requirement>
<requirement type="package" version="1.14.0">scipy</requirement>
</requirements>
<command detect_errors="exit_code"><![CDATA[
python '$__tool_directory__/clustering_from_distmat.py'
'$distmat'
result
--method $method
#if str($cluster_assignment.select) == 'n-cluster':
--n-clusters $cluster_assignment.n_cluster
#elif str($cluster_assignment.select) == 'height':
--height $cluster_assignment.height
#end if
]]></command>
<inputs>
<param name="distmat" type="data" format="tabular" label="Distance matrix" />
<param name="method" type="select" label="Clustering method">
<option value="single">Nearest Point (scipy 'single' method)</option>
<option value="complete">Farthest Point (scipy 'complete' method)</option>
<option value="average" selected="true">UPGMA (scipy 'average' method)</option>
<option value="weighted">WPGMA (scipy 'weighted' method)</option>
<option value="centroid">UPGMC (scipy 'centroid' method)</option>
<option value="median">WPGMC (scipy 'median' method)</option>
<option value="ward">Ward/Incremental (scipy 'ward' method)</option>
</param>
<conditional name="cluster_assignment">
<param name="select" type="select" label="Generate cluster assignments?">
<option value="dendrogram-only">No, just generate the dendrogram of clustering results</option>
<option value="n-cluster">Yes, and divide into specified number of clusters </option>
<option value="height">Yes, and use distance threshold to divide into clusters</option>
</param>
<when value="dendrogram-only" />
<when value="n-cluster">
<param name="n_cluster" type="integer" value="5" min="1" label="How many clusters to divide into?" />
<param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" />
</when>
<when value="height">
<param name="height" type="float" value="5.0" label="Distance threshold for clusters to be reported" />
<param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" />
bgruening marked this conversation as resolved.
Show resolved Hide resolved
</when>
</conditional>
</inputs>
<outputs>
<data name="clustering_dendrogram" format="newick" from_work_dir="result.tree.newick" label="${tool.name} on ${on_string}: Dendrogram">
<filter>cluster_assignment["select"] == "dendrogram-only" or cluster_assignment["generate_dendrogram"]</filter>
</data>
<data name="clustering_assignment" format="tabular" from_work_dir="result.cluster_assignments.tsv" label="${tool.name} on ${on_string}: Cluster assignment">
<filter>cluster_assignment["select"] in ["n-cluster", "height"]</filter>
</data>
</outputs>
<tests>
<test expect_num_outputs="1">
<param name="distmat" value="test_matrix.tsv"/>
<output name="clustering_dendrogram" ftype="newick" file="test_tree_average.newick" />
</test>
<test expect_num_outputs="1">
<param name="distmat" value="test_matrix.tsv" />
<param name="method" value="complete" />
<output name="clustering_dendrogram" ftype="newick" file="test_tree_complete.newick" />
</test>
<test expect_num_outputs="1">
<param name="distmat" value="test_matrix.tsv"/>
<conditional name="cluster_assignment">
<param name="select" value="height" />
<param name="height" value="18" />
</conditional>
<output name="clustering_assignment" ftype="tabular" file="test_assignment_average_18.tsv" />
</test>
<test expect_num_outputs="2">
<param name="distmat" value="test_matrix.tsv"/>
<conditional name="cluster_assignment">
<param name="select" value="height" />
<param name="n_cluster" value="4" />
</conditional>
<output name="clustering_assignment" ftype="tabular" file="test_assignment_average_18.tsv" />
</test>
</tests>
<help><![CDATA[

.. class:: infomark

**What it does**

This tool lets you perform hierarchical clustering of samples using the `scipy.cluster.hierarchy.linkage`_ function and any of the clustering methods supported by it.

As input it expects a symmetrical distance matrix with sample names on the first row and in the first column.

The clustering result can be reported in the form of a dendrogram in newick format.

Additionally, the tool can report the assignment of the samples to clusters "cut" from the clustering tree using the `scipy.cluster.hierarchy.cut_tree`_ function.
Reflecting the parameters of that function, you can specify *how* to cut the tree by specifying either the number of clusters to cut into or a distance threshold, i.e., the height at which to cut the tree as SciPy calls it.

.. _`scipy.cluster.hierarchy.linkage`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
.. _`scipy.cluster.hierarchy.cut_tree`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.cut_tree.html
]]></help>
<citations>
<citation type="doi">10.1038/s41592-019-0686-2</citation>
</citations>
</tool>
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
a 1
b 1
c 2
d 3
e 4
6 changes: 6 additions & 0 deletions tools/clustering_from_distmat/test-data/test_matrix.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
a b c d e
a 0 17 21 31 23
b 17 0 30 34 21
c 21 30 0 28 39
d 31 34 28 0 43
e 23 21 39 43 0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
((e:11.0,(a:8.5,b:8.5):2.5):5.5,(c:14.0,d:14.0):2.5);
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
((e:11.5,(a:8.5,b:8.5):3.0):10.0,(c:14.0,d:14.0):7.5);
Loading