diff --git a/.github/workflows/cli_vamb.yml b/.github/workflows/cli_vamb.yml index 1f74ae2a..9018429d 100644 --- a/.github/workflows/cli_vamb.yml +++ b/.github/workflows/cli_vamb.yml @@ -28,7 +28,7 @@ jobs: cache-dependency-path: '**/pyproject.toml' - name: Download fixtures run: | - wget https://www.dropbox.com/scl/fi/xzc0tro7oe6tfm3igygpj/ci_data.zip\?rlkey\=xuv6b5eoynfryp4fba1kfp5jm\&st\=9oz24ych\&dl\=0 -O ci_data.zip + wget https://www.dropbox.com/scl/fi/xzc0tro7oe6tfm3igygpj/ci_data.zip\?rlkey\=xuv6b5eoynfryp4fba1kfp5jm\&st\=rjb1xccw\&dl\=0 -O ci_data.zip unzip -o ci_data.zip - name: Install dependencies run: | @@ -61,6 +61,6 @@ jobs: cat outdir_taxometer/log.txt - name: Run k-means reclustering run: | - vamb recluster --outdir outdir_recluster --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --latent_path outdir_taxvamb/vaevae_latent.npz --clusters_path outdir_taxvamb/vaevae_clusters_split.tsv --hmmout_path markers_mock.hmmout --algorithm kmeans --minfasta 200000 + vamb recluster --outdir outdir_recluster --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --latent_path outdir_taxvamb/vaevae_latent.npz --clusters_path outdir_taxvamb/vaevae_clusters_split.tsv --markers markers_mock.npz --algorithm kmeans --minfasta 200000 ls -la outdir_recluster cat outdir_recluster/log.txt diff --git a/pyproject.toml b/pyproject.toml index 65452776..82983def 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,6 @@ dependencies = [ "pycoverm == 0.6.0", "networkx == 3.2", # 3.3 drops Python 3.9 support "scikit-learn == 1.5.0", - "pandas == 2.2.0", "dadaptation == 3.2", "loguru == 0.7.2", "pyhmmer == 0.10.12", diff --git a/test/data/marker.fna b/test/data/marker.fna new file mode 100644 index 00000000..e929c753 --- /dev/null +++ b/test/data/marker.fna @@ -0,0 +1,2 @@ +>abc 
+GTGGTGAGTGCTTGGACGTGTGTGGTGATGCTCATGGTGTCTTCCCCCCTTCCCCGTGTTGTTACCACCCACTCTACCCGCTCACACTTTCGAATACAAGGGTATTTTTCGAAATGGAAAAGCTGTCGAGCACTCACATGTTCGACAGCTTCATATGTCCTTGAACTACGTAACTTTTTCCTTACAATCAGCGCAAAGTCCAAAGATTTCGGCCTCGTGACTACTGAGAGCAAAGCCGTTTTTAGTGGCAATTTCCTGTGCCCATGTCTCTACTGGACCGCCATCGATTTCGACTGTGTGACCGCAATTGGTGCAGACCAAGTGATGGTGGTGTCCCTCGTCGTGGCATTGGCGGTACAGAGTTTCTCCACCTGTGACGGTAAGTACGTCGACTGCCCTGATGTCCGCGAGGGATTGGAGGGTTCGGTAGACGGTTGTGAGGCCGACGTTGTGTTCCCTGGTGGATAGCTCGTGATGGATTTCTTTGGCGGAAGCGAAGTTATCGATTTCCTCAAGAACGTCAATTACGGCTTTTCGCTGTCTGGTGCTTCGCACTCCCAGCTTCGGGGCAGAGCCTTGGCTGATGCGATTGATACCCACCGTTGATCCTCCTCAATGACACAAAATGTACTTCGATAGTCTACCCAGATGTGTCAACCCCTGCGTTTAGTGCCAGGAGAAGTATGTCGAGGACGAGTGGTTCGGCAAGTGAATAAGTCATTTGCCGGCCTTGACGTTCTGCGTCGACGATACCTGCAGTTTTAAGGACTTTGAGGTGTTGGCTCACTAGTGGTTGCGAACTTTTTACTAGCTTGACCAATTCGTGGACGTAGTGGGGCCTTTCGTTGAGGGCGAGGATGATTTCGATTCTTAAGGGGGAATCTAGTGCCCTAATCAGCAGGCTGATCGCTTTGATGTTTTTTGCTGTTGCAAGTTTCTGAAGCTCAGCTGATGCTGTGGATTCGGACTCTTCTGCAGGGGTGACGAAATTCCGATTTGAGTGTTGAGCCACGGGGAAGTCCTTCCGTCCTTAGGCTAGGTCTGGAATGGATCTAGCACGCTTGCTATTTTACCTTCTATATAAACCTTTTATGAGGGAAATGAAAAAATAGTTATTAGAACTAGTTTACATCGCGAAGGCCGCAAAATGACGGGGTCAGCGGAAGCAACATCGTTAGTTGGGCTAGGATTGGTTGGGTATGTCCTAAAAGGGACGGTTATTTTTTCATTCGACGTGGAGGAGAGCATCCGACGTGGCTCAGCAATCGATCATCGACACCGTGGTTAACCTGTGTAAACGACGTGGACTGGTGTACCCCTGTGGTGAGATCTACGGCGGTACCCGCTCTGCGTGGGACTACGGCCCGCTGGGTGTGGAGCTGAAGGAAAACATCAAGCGCCAGTGGTGGCGTTCTATGGTTACTTCCCGCCCAGATGTTGTGGGTGTTGATACTTCTGTCATCCTTCCTCGCCAGGTGTGGGTAACTTCCGGCCACGTTGAGGTCTTCACTGACCCACTGGTTGAGTCTTTGAACACCCACAAGCGTTACCGTGCGGACCACCTGCTGGAGCAGTACGAAGAGAAGCATGGTCACCCACCTGTAAACGGCTTGGCTGACATCAACGATCCAGAGACCGGCCAGCCAGGTAACTGGACTGAGCCTAAGGCGTTCTCTGGTCTTCTGAAGACTTTCTTGGGACCTGTGGACGACGAAGAGGGTCTGCACTACCTGCGCCCTGAAACTGCTCAGGGTATCTTCGTGAACTTCAAGAACGTGATGAACACTTCACGCATGAAGCCACCTTTCGGTATCGCGAACATCGGTAAGTCTTTCCGTAACGAGATCACCCCAGGTAACTTCATTTTCCGTACTCGTGAGTTCGAGCAGATGGAGATGGAGTTCTTCGTCAAGCCTGGTGAGGACGAAGAGTGGCACCAGCACTGGATTGATACTCGCCTGCAGTGGTACATCAACCTGGGCATTAAGCCTGAGAACCTGCGTCTGTACGAGCACCCTCAGGAGAAGCTGTCTCACTACTCCAAGCGCACTGTTGATATTGAGTACGCATTCAACTTTGCTAACACCAAGTGGGGCGAGTTAGAGGGTATCGCGAACCGTACTGATTACGATCTTCGCGTGCACTCTGAGGGCTCTGGTGAGGACCTGTCATTCTTCGATCAGGAGACCAATGAGCGTTGGATTCCTTTCGTAATCGAGCCTGCTGCAGGTCTTGGTCGCGCAATGATGATGTTCCTGATGGATGCTTATCACGAGGACGAGGCACCAAACTCAAAGGGTGGCGTCGATAAGCGTGTTGTTCTGAAGCTTGACCGTCGCCTTGCGCCGGTTAAGGTTGCGGTCTTGCCGCTGTCAAAGAAGGACACTTTGACGCCTTTGGCGGAAAAGCTCGCAGCAGAGCTGCGTGAATTCTGGAACGTTGATTACGACACTTCAGGTGCGATTGGTCGCCGTTACCGTCGTCAGGACGAGATCGGTACTCCATTCTGCGTCACCGTTGACTTTGATTCTCTCGAGGACAACGCTGTGACCGTGCGTGAGCGCGACACCATGGAGCAGGTTCGTGTTCCACTTGATGAGCTGCAGGGTTACTTGGCTCAGCGCCTCATCGGCTGCTAAACGGCAACCAATAGAGCGATAATTCGCTAAGACGAATGTAATCGCAGCAACATATAGCACCGGCTTAACAGGCCGGTGCTATTCTGTTCGCATGACTTCGAAGGATCTGATTGTGACCTCCTATACGTCTTGGGGCAAGCGTTTCAAGAATGACGGGAAGCTTTTTATTAACCTACTTCGCAGCACCACTGATAGTGCTGATGAAAAGGTTTTAGCCACTTTCGGTGAAGTTCCCAGCAAATCATTTGAAACCACCGCAACGGTTGATGAGCAGCAGTGGGAACTGTCCTTCAATATTGATGGAACGGCAACTGCCAAGCTTCCTGATGGTCGTGTGTTCAGCGCGAATGCAGGTGAGAAGACCTTTACCAAGTCCAAGCGGATTGAAATCGACATGGACGGCACCGCGATGGCTGCTGTTAATGAAGATAAAAACAATTGGATTATCGACGATTCTGAAGAGAATAAAGTCGCTCAGTTTACCGGTATGAACAACGGTGTGCGTCGCGCGATTGTGGAGTTTGAGCCTGACGTAGAAGTCACCCAGGAGCAGGAAATTTTCTTGTCGTGGGTTGCTCGGAAAACTCTGGAATCCCGCATGTTGGGCTCCAGTTGGGGACTGACTCTGTTTTTGATCATTTTGACGCCAATCATTATTTTTCTCACTTTCAGCTAAAAGGACCATGCAATGGTAGACGCTCAGCGCCCCAAAGCAGGCATCTTCGGTAGCCACACAGAAGAAACATGGGTGTGGCTCGGTAATGAACTTTTCGACGAGTCCGGCGAGGTCATCGCCGACGTTCGCTCCGACGTCCTCTACGTGGATCGCGAACGACTACTCATCGAATCCACCCCCGGCACCATGCGTTTTCGTTGCCGCGCAACACTGTCCGGGGGTGAGGTCTATACAATGAC
TCAGAATTCTTTCACTGTGGGGGATCTCACTGCGGTGTGCGGGCGCCGGACGTATTCGCTAAAAAGGGTGTCGCCGTGGCGTAAAGAACGCCTGATCACCAACAATGGGGTGGAAGTGGCGCGACTTCGCCCGATGACCAGCGGTAAAGTCGAATTCATTGTGGGCACCGCGGACAGCGAGGCGTTGCCGTTCGTCGACGCAGTATTTTTGAGCTGGGCGTGCGTCCTGGTGGATTCGGCCGTGCGCCGGCCGAAAATTTAAAAGCTTTTTGCTTATCGACGCACCCCTCCACCTGTTTTTTGTAGCCGGGGGATCATTTCCTTTGAAGGATCCAATCTCCGCACTTAGTTTCCTTCGGTGTGAAGGAAAGAGTTCCGTAAAGACCTCTATCTCATTTAAAGAAGTGGAGGATTAGGGTCGTTGACTCGCCTTCGGCACTAATTTGAGCCAAGTTCAAGTTTGCTGCCATCCCAGGTGACCGAAAATGTCCTATGCGAGGTCTCTTCGGTCACTTGGTTTTGCTCGTTTCAGGCTAGAAGCGGCCTCCGCGGAACCCTCCTCCGCCACCGCCACCACCGCTGAAGCCGCCACCGCCTCCACCGAAGCCTCCACCGAAACCGCCACCGCGGCCGCTGTTGAGAATCGAGTTGATCACCATGCCGGTGACAATCGCACCGGTGGTTCCGCCACCGGAATTGTGGCGATTGTTGTAGGTGGTGATGTCGTTTTGTGCTGACTTGCTGGCGCGTTGGGCTGCGACTGCTGCTTGACGTCCGTAATCAATTCCTGCACGGGTGTCGCGGGTGCGGTTTTGTTGTGCCATGGCGTACAGTTTTTGTGCGTTGGCCAGGTGGGTGCGGGCTTCGGATTTTACGATGCGACCGCGGGTGGAGATGAGGTCTTCGGCCTTTTGGATTTGGCTTCTTGCAGATTGCAGCTGTTGGTCGAATACGCGTAGCTGGCGGGCTTGATCAGCTGCGGTGGCGCGAAGTGTGTCAAGTTGAGTGTCGAGGGCGGAGTCGACATCGACAAGTTCTGTGTAGGTTCCGAGCGGATCCTTTTCGGCGTCTGCTGATGCGGTGGTTAGTGCTGCGCTGGCTGCGCGGACAGCATCGTCGAGGGAGGCCCAGTCGGCACGGGCACCGTCGGCTCCTGCGCTTTGTTTGAGTTGGCCGGCTTCGTTGATTTCGTCTGAGATTTCTTGAATCAGATCGGCAACGTTTGCTTTGGCTGTGGAGATGTTTTCATCGGCATGCTCGACGCCCTCGAGGAGTTTGTCTGCGGTAGTGATGGCGTGCTCGATGTGACGGATCGCGTCGATAAGCCCGCCCTGCTCGCCTGCGGGCATGGACTCTATCTTGTACGCCTGTGGCAGGACTTCTTCTGCTTCGTCGAGCGAAGCGCTGGCGAGGTCGACGTTGTCGTCGATGCTTTCAAGGACCTCTGCTGAGTAGCGAGCGCGCAGGCCAGCGAGTGTTTCTTGAGCCTTGGGGAGGCGGGTGCGCAGGTCGACGGATTTTTGGGTGAGAGCATCCAATTTGCTGCCCGCGTTGATCAGCAGGTTGCGCATATCGGCAAAGTTTTGGGCCTCGGCGTCGAGGGCATCGTCGGCTTGGCCACAGGATGAAATGATTTCTACCAGCATGGATCGACGTTCGGCTTCGGATTCTGGGATAGAATCGTTGAGGCGCTGCTGAATCTCAAAGGCTTTTTGCAGGGTGCCGGTGGAGTGGTTCATGGCGCGGTTGAAGCTGCGGGTGCGCTCTGGTCCGAACTCGGAGGTAGCGATAGCGAGCTCTTCTTTTCCGCGACGGATGGAGTCATCAGTGGAGGTGAGCTCTTCTTGGGCAAGGTGTTCGAGAGTTTCCATGGGAAGCTGCATGAGGCGGTTGGTATCGCGAGGGTCGATCTCACGTGCATCTTCCAAGGTTGCAGCACTTGTTTTCTTCTTGCGGCTGCGGGAATAGGCCCAAATTCCGCCACCAGCGGCCACTGTGCCAACGCCCGCAGCAGCCAACCAAGCGCCGGAAGATCCAGAAGAGCCTGAGGTTCCTGAGGCACCAGAACTAGAACCAACTGATTCTGCCAGCGCTAGTGCGGAGCCTGCCCAATCTTCTTGGGAAAGCGCCTGGAAAGCAGCGTTGTTGGCGGCGTCGAGTTCAGCGTCGGTCCATTGAGTACCACCTTGGATGCCGTACTGCCGTTCCTCGGGAGCGAGTGCATAAACCAAGACGTTTCCGCCGCCGTTGGCTTGGAGTGCTTGCTGCGTCCACGTTTCAGGGTCAACTCCGTCGAAAGAGCTTAGGAAAACAACGAAAATAACCTTTTGTTCAGATGCCTTTACATCATCGATGGCAGCCTGAATGTTGGTGATATCGGACGAGGAAATCTGGCCGGTGTAGTCAGTGACATTGTCTTGGTAAAATTCTGGTGATTCAGCCAAGACATATGTTTCTGTGGCTTCTGCAGTGTGAGCAGTAAAAAATGGTCCACTGATAAGGAGCGCGCCAGCTCCAATTGCCACAGTGACCGATACACGGCGGACGTTTTCCCGAAGATGCACCAAACTAAAGTTCATGGTCCCCACCTTAGACGAGTCCAGCTGGCACACTAGTTAACGTGAGAAGATTTTTAGCCAAGAGTTTACTCTTAACCGCAGTAGCGCAACCAGCCCTGAGGGTGGTCGCGTATTCGATGCTCAGAACGCCTAATAATCGGCACAAAATTGATTCAATTTTGGTGTTGGGCACAGCTCAATATGATGGGGTTCCATCGAGGCAGTTTGCTGCTCGTTTGAGGCATGCCGCGAAGCTGTGGCGTCTTCATGAAATCCAGCATGTATATACTGTCGGCGGAAAACTTCCTGGTGATCGTTTCACCGAAGCAGAAGTCGCGCGGGAGTATTTGATCAAAGAGGGCGTGGATCCGGATCTGATTTTTGTCTCTGCAGTTGGCAATGACACTGTCTCCTCCTATGAGGCGCTTGATCCGGAAAAGCTTGGTCGGGTGCTGATTGTTACTGATCCGAACCATTCGTATCGGGCGGTGCGCATCGCGCGACGCATGGGCTTTGACGCGAAACCTTCCCCGACAACCTATAGTCCCGCGAAGTTTCCGTCGATAGTTTATTTTCTGACCTTGTCCCATGAGTGGGGCGGGGTAGTGGTACAGGACGTGTCGTGGCTCTTGGGCGAACGGGTGGCCGATAAGGTGGAAGCATCTTTGCGAACTATCCAAGGCCTGCTGCGCCCTTCGAGGCGTGCGCGCCATGAGCAACTTCGGAGGCTGAAAAAGTAGATGTACCCCTATTCCGACGCAGACGCTTTTCGACGCCACCCTGAGCGCGCCAAGTCCAGCCAACTGCGCACCAGCGCCGTAGACACCCGCAGCGCGTTCGCCCGCGACCGGGCTCGCGTGCTGCATTCTGCTGCTCTTCGACGCCTCGCGGATAAAACCCAAGTGGTTGGCCCCAATGATGGTGATACTCCGCGCACCCGGCTGACGCACTCTTTGGAAGTAGCTCAAATTGCACGGGGAATCGGAGCTGGACTGGA
TTTGGATCCTGATCTGTGCGATCTGGCAGGGCTGTGCCATGACATTGGGCATCCGCCGTATGGACACAACGGTGAAAACGCGTTGAATGAAGTTGCTGCGGCCTGTGGAGGATTTGAGGGCAACGCCCAAACCTTGCGTATTCTCACGCGGCTGGAGCCAAAAATTGTCTCTGATGAGGGGGAGAGCTTTGGGCTGAACTTGTCGCGGGCTGCTCTTGATGCTGCATGTAAGTATCCGTGGGCTAAAACAAATGCGGATGGCAGTGTCAATAAGAAATACAGTGCTTATGATGAGGACGCAGAAATTCTTGCTTGGATCAGGCAAGGCCATGAAGACCTCAGACCACCAATCGAAGCTCAGGTCATGGACTTTTCCGATGATATTGCCTACTCAGTACACGATGTAGAAGACGGCATTGTTTCCGGTCGCATCGATTTGAAAGTGCTGTGGGACCTGGTGGAATTAGCAGCACTGGCGGACAAAGGCGCAGCAGCTTTCGGAGGCTCGCCTGCAGAACTCATCGAGGGCGCAGCCTCGTTGCGGGAGCTTCCTGTGGTAGCGGCCGCTGCAGATTTTGATTTCTCACTGCGTTCCTACGCTGCGCTGAAGGCCATGACCTCAGAACTAGTGGGAAGATACGTTGGCTCTACCATCGAGTCAACAAAGAAAACACACGCCGGCATTGATGTGGGACGCATGCACGGCGATTTGATCATTCCAGAAACAGCGGCCAGTGAAGTAAAACTGCTCAAAACGTTAGCGGTTCTCTATGTGATGGATGACCCAGGGCACCTTGCGCGCCAAAACAGGCAACGGGATCGTATCTTCCGGGTTTTTGACTACCTGGTGCTGGGGGCTCCGGGATCGTTGGATCCGATGTATCGCCAGTGGTTTATTGAAGCGGATTCAGAATCGGAACAGATCCGTGTGATTGTTGATCAGATTGCGTCGATGACGGAGTCTCGTCTGGAACGCCTTGCCCGGAATGCTGCTGACATCTCAGGATTCTTGGGATAGTTGGTTAGAGCAGCAGCGATTTTTAGTAAGGCCAATAACATGTTTTGGCTTAAACCTGTGTCGTGTCAGATGGTGGCGAAGTAGAGTTCGCAAAGCTAGCGAACATGAATTCGTGTTCAGGAACTTAACAGGGATCAAACAGAGAACAGAGAACAGATCACGCTGCCCAAAAATCGCACTTTTAAGGTTTGTGGGCGTCTGTGTGTGGTTTGCCGCTGTAAAGTATCACCACGTTATGCGCCCTGGTGTGATCAAGCGTTCGTTCTGGGTCGAAACCCCAAAAGTCACAATTCCCCAGAAGCGGGTCAAACCCATTTAGCTTATTGCTTACATATCGAGGGTTTAGAAAAGTGATTTGTCGGATCAGTCGGTTTCTGCCAAGTAAATAGAACTTTATAAATTTTGTGGCTCTCAAATCTTAGGCCACGGCTTCCGATTTGAACCGGAGGTTCAAAAGGCTTATATAGACAAGATTCTGCATCGTCTCACGAGCCCCTCATTGCCTGACACGGTCAATCGTGTGGGAGGTACCAATCCGTGAGATTTCTGCCAACGAGCGATTCATTGGCCCCGCTGCAGAGCTGGCAGAACACGGACATAACCCAAATAATCTGAGGTCTGCCGTTTGCAGCAGCATTAGCGTTTGATGTGGAAGGTGATGCAGAGGCTGTTGATCTGCAAGCGCGTCTTTCCCAAGCACGGGGGAACCCTGAAGCATCGGATGCTCTAGTTGCTGAGCTGACTGGTGTTACTGCTAATCATCCGTTGGTCAGTGCTTGTCTGAAGTTTCCGCTCAATCCTAAGCTTCTCAAGATTTCGTAAAAAAGCTGCCAACTACCGTAAAACCGCACTACTAGAGGAGTGCGTTTTTCGTTCCTGAACACATTGCGTGCTGCAACTTAATTATGGTCCTCCCAGCTCAGTGTGCTGTGTGGATTGTTTATTCTCGTCCATTAAGTGATCGAGAAAAAGTTGTTGTAAAGTCATGCGCATGTGTGGAATTGTTGGATATATTGGCCAGGCGGGCGACTCCCGTGATTACTTTGCGCTTGACGTCGTTTTAGAAGGACTGCGCCGACTTGAATACCGCGGTTATGATTCCGCAGGTGTAGCTGTTCATGCGAACGGTGAAATCAGCTACCGAAAGAAGGCTGGAAAGGTAGCTGCGCTGGACGCTGAGATCGCTCGCGCTCCTTTGGCGGATTCCATTTTGGCTATTGGTCACACCCGGTGGGCAACTCACGGTGGACCAACCGATGCAAATGCACACCCCCATGTTGTTGATGGCGGCAAGTTAGCTGTCGTACACAACGGTATTATTGAAAACTTTGCAGAGCTGCGCGCAGAGCTTTCAGCTAAGGGCTACAACTTTGTTTCCGTTACTGACACTGAAGTTGCCGCCACATTGCTGGCAGAAATCTACAACACCCAGGCTAATGGCGATCTGACCAAGGCTATGCAGCTTACTGGTCAGCGTCTTGAGGGTGCGTTCACCCTGCTGGCTATCCATGCTGATCATGATGATCGTATTGTTGCAGCGCGCCGTAACTCTCCTTTGGTTATTGGCTTGGGTGAAGGCGAAAACTTCCTCGGCTCTGACGTTTCTGGCTTCATCGATTACACCCGCAAGGCTGTTGAGATGGGCAACGATCAGATTGTGACCATCACTGCGAACGACTACCAGATCACCAACTTCGATGGTTCTGAGGCAACCGGAAAACCTTTCGACGTGGAGTGGGATGCGGCTGCTGCTGAAAAGGGTGGCTTTGATTCCTTCATGGATAAGGAAATCCACGACCAGCCAGCTGCAGTGCGTGACACCCTCCTCGGACGTTTAGATGAGGATGGCAAGCTGGTCCTTGATGAGCTTCGT diff --git a/test/test_parsemarkers.py b/test/test_parsemarkers.py new file mode 100644 index 00000000..7dace124 --- /dev/null +++ b/test/test_parsemarkers.py @@ -0,0 +1,37 @@ +import unittest +import vamb +import testtools +from pathlib import Path +import tempfile +import shutil +import io + + +class TestParseMarkers(unittest.TestCase): + def test_instantiate(self): + tmp = tempfile.mkdtemp() + tmp_path = Path(tmp) + shutil.rmtree(tmp) + markers = vamb.parsemarkers.Markers.from_files( + Path(testtools.DATADIR).joinpath("marker.fna"), + Path(testtools.PARENTDIR).joinpath("vamb").joinpath("marker.hmm"), + ["abc"], + tmp_path, + 4, + None, + ) + 
self.assertIsNotNone(markers.markers[0]) + self.assertEqual(len(markers.markers), 1) + self.assertEqual(set(markers.markers[0]), {39}) + self.assertEqual( + markers.refhash, vamb.vambtools.RefHasher.hash_refnames(["abc"]) + ) + + buf = io.StringIO() + markers.save(buf) + buf.seek(0) + + markers2 = vamb.parsemarkers.Markers.load(buf, markers.refhash) + self.assertEqual(len(markers.markers), len(markers2.markers)) + self.assertEqual(set(markers.markers[0]), set(markers2.markers[0])) + self.assertEqual(markers.marker_names, markers2.marker_names) diff --git a/test/test_reclustering.py b/test/test_reclustering.py index 0118f74b..6c250039 100644 --- a/test/test_reclustering.py +++ b/test/test_reclustering.py @@ -1,8 +1,5 @@ import unittest -# Invariants: -# It produces disjoint clusters, a subset of the input points - # For CAMI dataset, compute comp, abundance, taxonomy, markers # Subset to e.g. 5 genera plus a few unclassified contigs @@ -22,3 +19,8 @@ class TestKmeansReclustering(unittest.TestCase): # Create latent # Initial clustering + + +class TestDBScanReclustering(unittest.TestCase): + # It produces disjoint clusters, a subset of the input points + pass diff --git a/test/test_semisupervised_encode.py b/test/test_semisupervised_encode.py index 27369341..6472e3ab 100644 --- a/test/test_semisupervised_encode.py +++ b/test/test_semisupervised_encode.py @@ -31,14 +31,14 @@ def make_random_annotation(self): phylum = np.random.choice(self.phyla, 1)[0] clas = np.random.choice(self.classes[phylum], 1)[0] if np.random.random() <= 0.2: - return vamb.vambtools.ContigTaxonomy.from_semicolon_sep( + return vamb.taxonomy.ContigTaxonomy.from_semicolon_sep( ";".join([self.domain]) ) if 0.2 < np.random.random() <= 0.5: - return vamb.vambtools.ContigTaxonomy.from_semicolon_sep( + return vamb.taxonomy.ContigTaxonomy.from_semicolon_sep( ";".join([self.domain, phylum]) ) - return vamb.vambtools.ContigTaxonomy.from_semicolon_sep( + return vamb.taxonomy.ContigTaxonomy.from_semicolon_sep( ";".join([self.domain, phylum, clas]) ) diff --git a/vamb/__init__.py b/vamb/__init__.py index 9981660c..10dce8c5 100644 --- a/vamb/__init__.py +++ b/vamb/__init__.py @@ -16,6 +16,7 @@ from . import parsebam from . import parsecontigs from . import parsemarkers +from . import taxonomy from . import cluster from . import encode from . 
import aamb_encode @@ -35,6 +36,7 @@ "parsebam", "parsecontigs", "parsemarkers", + "taxonomy", "cluster", "encode", "aamb_encode", diff --git a/vamb/__main__.py b/vamb/__main__.py index d05863d2..b5f73fbd 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -15,7 +15,6 @@ from typing import Optional, Tuple, Union, cast, Callable, Literal, NamedTuple from pathlib import Path from collections.abc import Sequence -from collections import defaultdict from torch.utils.data import DataLoader from functools import partial from loguru import logger @@ -377,11 +376,18 @@ def __init__( class TaxonomyPath: @classmethod def from_args(cls, args: argparse.Namespace): + if args.taxonomy is None: + raise argparse.ArgumentTypeError( + "Cannot load taxonomy without specifying --taxonomy" + ) return cls(typeasserted(args.taxonomy, Path)) def __init__(self, path: Path): self.path = check_existing_file(path) + def get_tax_path(self) -> Path: + return self.path + class TaxometerOptions: @classmethod @@ -450,6 +456,13 @@ def from_args(cls, args: argparse.Namespace): def __init__(self, path_or_tax_options: Union[TaxonomyPath, TaxometerOptions]): self.path_or_tax_options = path_or_tax_options + def get_tax_path(self) -> Path: + p = self.path_or_tax_options + if isinstance(p, TaxonomyPath): + return p.get_tax_path() + else: + return p.taxonomy.get_tax_path() + class MarkerPath: def __init__(self, path: Path): @@ -940,6 +953,41 @@ def load_composition_and_abundance( return (composition, abundance) +def load_markers( + options: MarkerOptions, + comp_metadata: vamb.parsecontigs.CompositionMetaData, + existing_outdir: Path, + n_threads: int, +) -> vamb.parsemarkers.Markers: + logger.info("Loading markers") + begin_time = time.time() + if isinstance(options.content, MarkerPath): + path = options.content.path + logger.info(f'\tLoading markers from existing `markers.npz` at "{path}"') + markers = vamb.parsemarkers.Markers.load(path, comp_metadata.refhash) + elif isinstance(options.content, PredictMarkers): + logger.info("\tPredicting markers. 
This might take some time") + logger.info(f"\t\tFASTA file located at {options.content.fasta.path}") + logger.info( + f"\t\tHMM profile file (.hmm file) located at {options.content.hmm}" + ) + markers = vamb.parsemarkers.Markers.from_files( + options.content.fasta.path, + options.content.hmm, + cast(Sequence[str], comp_metadata.identifiers), + existing_outdir.joinpath("tmp_markers"), + n_threads, + comp_metadata.refhash, + ) + markers.save(existing_outdir.joinpath("markers.npz")) + else: + assert False + + elapsed = round(time.time() - begin_time, 2) + logger.info(f"\tProcessed markers in {elapsed} seconds.\n") + return markers + + def trainvae( vae_options: VAEOptions, vamb_options: GeneralOptions, @@ -1267,26 +1315,26 @@ def run_bin_aae(opt: BinAvambOptions): def predict_taxonomy( + comp_metadata: vamb.parsecontigs.CompositionMetaData, abundance: np.ndarray, tnfs: np.ndarray, lengths: np.ndarray, - contignames: np.ndarray, - taxonomy_path: Path, out_dir: Path, taxonomy_options: TaxometerOptions, cuda: bool, -): +) -> vamb.taxonomy.PredictedTaxonomy: begintime = time.time() + logger.info("Predicting taxonomy with Taxometer") - taxonomies = vamb.vambtools.parse_taxonomy( - input_file=taxonomy_path, - contignames=cast(Sequence[str], contignames), - is_canonical=False, + taxonomies = vamb.taxonomy.Taxonomy.from_file( + taxonomy_options.taxonomy.path, comp_metadata, False + ) + nodes, ind_nodes, table_parent = vamb.taxvamb_encode.make_graph( + taxonomies.contig_taxonomies ) - nodes, ind_nodes, table_parent = vamb.taxvamb_encode.make_graph(taxonomies) - logger.info(f"{len(nodes)} nodes in the graph") + logger.info(f"\t{len(nodes)} nodes in the graph") classes_order: list[str] = [] - for i in taxonomies: + for i in taxonomies.contig_taxonomies: if i is None or len(i.ranks) == 0: classes_order.append("root") else: @@ -1342,36 +1390,31 @@ def predict_taxonomy( nodes_ar = np.array(nodes) logger.info(f"Using threshold {taxonomy_options.softmax_threshold}") - row = 0 - output_exists = False + contig_taxonomies: list[vamb.taxonomy.PredictedContigTaxonomy] = [] for predicted_vector, predicted_labels in model.predict(dataloader_vamb): - N = len(predicted_vector) - predictions: list[str] = [] - probs: list[str] = [] for i in range(predicted_vector.shape[0]): label = predicted_labels[i] while table_parent[label] != -1: label = table_parent[label] - threshold_mask = predicted_vector[i] > taxonomy_options.softmax_threshold - pred_line = ";".join(nodes_ar[threshold_mask][1:]) - predictions.append(pred_line) - absolute_probs = predicted_vector[i][threshold_mask] - absolute_prob = ";".join(map(str, absolute_probs[1:])) - probs.append(absolute_prob) - - with open(predicted_path, "a") as file: - if not output_exists: - print("contigs\tpredictions\tlengths\tscores", file=file) - for contig, length, prediction, prob in zip( - contignames[row : row + N], predictions, lengths[row : row + N], probs - ): - print(contig, length, prediction, prob, file=file, sep="\t") - output_exists = True - row += N + threshold_mask: Sequence[bool] = ( + predicted_vector[i] > taxonomy_options.softmax_threshold + ) + ranks = list(nodes_ar[threshold_mask][1:]) + probs = predicted_vector[i][threshold_mask][1:] + tax = vamb.taxonomy.PredictedContigTaxonomy( + vamb.taxonomy.ContigTaxonomy(ranks), probs + ) + contig_taxonomies.append(tax) + + taxonomy = vamb.taxonomy.PredictedTaxonomy(contig_taxonomies, comp_metadata, False) + + with open(predicted_path, "w") as file: + taxonomy.write_as_tsv(file, comp_metadata) logger.info( f"Completed 
taxonomy predictions in {round(time.time() - begintime, 2)} seconds." ) + return taxonomy def run_taxonomy_predictor(opt: TaxometerOptions): @@ -1381,18 +1424,17 @@ def run_taxonomy_predictor(opt: TaxometerOptions): opt.abundance, vamb.vambtools.BinSplitter.inert_splitter(), ) - abundance, tnfs, lengths, contignames = ( + abundance, tnfs, lengths, _ = ( abundance.matrix, composition.matrix, composition.metadata.lengths, composition.metadata.identifiers, ) predict_taxonomy( + comp_metadata=composition.metadata, abundance=abundance, tnfs=tnfs, lengths=lengths, - contignames=contignames, - taxonomy_path=opt.taxonomy.path, out_dir=opt.general.out_dir, taxonomy_options=opt, cuda=opt.general.cuda, @@ -1415,32 +1457,29 @@ def run_vaevae(opt: BinTaxVambOptions): ) if isinstance(opt.taxonomy.path_or_tax_options, TaxometerOptions): logger.info("Predicting missing values from taxonomy") - predict_taxonomy( + predicted_contig_taxonomies = predict_taxonomy( + comp_metadata=composition.metadata, abundance=abundance, tnfs=tnfs, lengths=lengths, - contignames=contignames, - taxonomy_path=opt.taxonomy.path_or_tax_options.taxonomy.path, out_dir=opt.common.general.out_dir, taxonomy_options=opt.taxonomy.path_or_tax_options, cuda=opt.common.general.cuda, ) - contig_taxonomies = vamb.vambtools.parse_taxonomy( - input_file=opt.common.general.out_dir.joinpath("results_taxometer.tsv"), - contignames=cast(Sequence[str], contignames), - is_canonical=False, - ) + contig_taxonomies = predicted_contig_taxonomies.to_taxonomy() else: logger.info("Not predicting the taxonomy") - contig_taxonomies = vamb.vambtools.parse_taxonomy( - input_file=opt.taxonomy.path_or_tax_options.path, - contignames=cast(Sequence[str], contignames), - is_canonical=False, + contig_taxonomies = vamb.taxonomy.Taxonomy.from_file( + opt.taxonomy.path_or_tax_options.path, + composition.metadata, + False, ) - nodes, ind_nodes, table_parent = vamb.taxvamb_encode.make_graph(contig_taxonomies) + nodes, ind_nodes, table_parent = vamb.taxvamb_encode.make_graph( + contig_taxonomies.contig_taxonomies + ) classes_order: list[str] = [] - for i in contig_taxonomies: + for i in contig_taxonomies.contig_taxonomies: if i is None or len(i.ranks) == 0: classes_order.append("root") else: @@ -1529,6 +1568,9 @@ def run_vaevae(opt: BinTaxVambOptions): ) +# TODO: The whole data flow around predict_taxonomy needs to change. +# Ideally, we should have a "get taxonomy" function that loads, possibly refines, +# and then returns the taxonomy object. 
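The refactored predict_taxonomy above no longer streams TSV rows; for each contig it masks the per-node softmax scores against the threshold and wraps the surviving ranks in a PredictedContigTaxonomy. A minimal sketch of that masking step, using a made-up single-lineage node array and toy scores rather than Vamb's full taxonomy graph (only the variable names are taken from the code above; the values are illustrative):

import numpy as np

# Hypothetical toy values: index 0 is the artificial "root" node,
# which is why the real code drops element 0 via [1:] after masking.
nodes_ar = np.array(["root", "Bacteria", "Bacillota", "Bacilli"])
predicted_vector = np.array([1.0, 0.99, 0.87, 0.42])  # softmax score per node
softmax_threshold = 0.5

threshold_mask = predicted_vector > softmax_threshold  # [True, True, True, False]
ranks = list(nodes_ar[threshold_mask][1:])             # ["Bacteria", "Bacillota"]
probs = predicted_vector[threshold_mask][1:]           # [0.99, 0.87]
# The lineage is kept only down to the ranks whose score clears the threshold.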
def run_reclustering(opt: ReclusteringOptions): composition = calc_tnf( opt.composition, @@ -1536,35 +1578,81 @@ def run_reclustering(opt: ReclusteringOptions): opt.general.out_dir, opt.output.binsplitter, ) - logger.info(f"{len(composition.metadata.identifiers)} contig names after filtering") - logger.info("Taxonomy predictions are provided") - markers = opt.markers.content - assert isinstance(markers, PredictMarkers) - reclustered = vamb.reclustering.recluster_bins( - None if isinstance(opt.algorithm, DBScanOptions) else opt.algorithm.clusters, - opt.latent_path, - str(opt.composition.path.path), - composition.metadata.identifiers, - out_dir=opt.general.out_dir, - minfasta=0, - binned_length=1000, - num_process=40, - random_seed=opt.general.seed, - hmmout_path=markers.hmm, - algorithm="kmeans" if isinstance(opt.algorithm, KmeansOptions) else "dbscan", - predictions_path=None - if isinstance(opt.algorithm, KmeansOptions) - else ( - opt.algorithm.taxonomy_options.path_or_tax_options.path - if isinstance( - opt.algorithm.taxonomy_options.path_or_tax_options, TaxonomyPath + markers = load_markers( + opt.markers, composition.metadata, opt.general.out_dir, opt.general.n_threads + ) + latent = vamb.vambtools.read_npz(opt.latent_path) + alg = opt.algorithm + + if isinstance(alg, DBScanOptions): + # If we should refine or not. + if isinstance(alg.taxonomy_options.path_or_tax_options, TaxometerOptions): + taxopt = alg.taxonomy_options.path_or_tax_options + abundance = calc_abundance( + taxopt.abundance, + taxopt.general.out_dir, + taxopt.general.refcheck, + composition.metadata, + taxopt.general.n_threads, ) - else opt.algorithm.taxonomy_options.path_or_tax_options.taxonomy.path - ), - ) - cluster_dict: defaultdict[str, set[str]] = defaultdict(set) - for k, v in zip(reclustered, composition.metadata.identifiers): - cluster_dict[cast(str, k)].add(v) + predicted_tax = predict_taxonomy( + composition.metadata, + abundance.matrix, + composition.matrix, + composition.metadata.lengths, + taxopt.general.out_dir, + taxopt, + taxopt.general.cuda, + ) + taxonomy = predicted_tax.to_taxonomy() + else: + tax_path = alg.taxonomy_options.path_or_tax_options.path + logger.info(f'Loading taxonomy from file "{tax_path}"') + taxonomy = vamb.taxonomy.Taxonomy.from_file( + tax_path, composition.metadata, True + ) + instantiated_alg = vamb.reclustering.DBScanAlgorithm( + composition.metadata, taxonomy, opt.general.n_threads + ) + logger.info("Reclustering") + logger.info("\tAlgorithm: DBSCAN") + reclustered_contigs = vamb.reclustering.recluster_bins( + markers, latent, instantiated_alg + ) + else: + with open(alg.clusters) as file: + clusters = vamb.vambtools.read_clusters(file) + contig_to_id: dict[str, int] = { + c: i for (i, c) in enumerate(composition.metadata.identifiers) + } + clusters_as_ids: list[set[vamb.reclustering.ContigId]] = [] + for cluster in clusters.values(): + s = set() + for contig in cluster: + i = contig_to_id.get(contig, None) + if i is None: + raise ValueError( + f'Contig "{contig}" found in the provided clusters file is not found in the ' + "provided composition." 
+ ) + s.add(vamb.reclustering.ContigId(i)) + clusters_as_ids.append(s) + instantiated_alg = vamb.reclustering.KmeansAlgorithm( + clusters_as_ids, + abs(opt.general.seed) % 4294967295, # Note: sklearn seeds must be uint32 + composition.metadata.lengths, + ) + logger.info("Reclustering") + logger.info("\tAlgorithm: KMeans") + reclustered_contigs = vamb.reclustering.recluster_bins( + markers, latent, instantiated_alg + ) + + logger.info("\tReclustering complete") + identifiers = composition.metadata.identifiers + clusters_dict: dict[str, set[str]] = dict() + for i, cluster in enumerate(reclustered_contigs): + clusters_dict[str(i)] = {identifiers[c] for c in cluster} if opt.output.min_fasta_output_size is None: fasta_output = None @@ -1580,7 +1668,7 @@ def run_reclustering(opt: ReclusteringOptions): fasta_output, opt.output.binsplitter, str(opt.general.out_dir.joinpath("clusters_reclustered")), - cluster_dict, + clusters_dict, cast(Sequence[str], composition.metadata.identifiers), cast(Sequence[int], composition.metadata.lengths), ) @@ -1857,6 +1945,15 @@ def add_predictor_arguments(subparser): default=1024, help=argparse.SUPPRESS, ) + pred_trainos.add_argument( + "-pq", + dest="pred_batchsteps", + metavar="", + type=int, + nargs="*", + default=[25, 75], + help=argparse.SUPPRESS, + ) pred_trainos.add_argument( "-pthr", dest="pred_softmax_threshold", @@ -2164,6 +2261,7 @@ def main(): add_marker_arguments(recluster_parser) add_bin_output_arguments(recluster_parser) add_reclustering_arguments(recluster_parser) + add_predictor_arguments(recluster_parser) add_taxonomy_arguments(recluster_parser) args = parser.parse_args() diff --git a/vamb/parsemarkers.py b/vamb/parsemarkers.py index d7210c9e..7d149a2c 100644 --- a/vamb/parsemarkers.py +++ b/vamb/parsemarkers.py @@ -18,8 +18,9 @@ from collections import defaultdict import json import numpy as np +from loguru import logger -MarkerID = NewType("Marker", int) +MarkerID = NewType("MarkerID", int) MarkerName = NewType("MarkerName", str) ContigID = NewType("ContigID", int) ContigName = NewType("ContigName", str) @@ -123,9 +124,9 @@ def from_files( cls, contigs: Path, hmm_path: Path, + contignames: Sequence[str], tmpdir_to_create: Path, n_processes: int, - fasta_entry_mask: Sequence[bool], target_refhash: Optional[bytes], ): """ @@ -141,37 +142,16 @@ def from_files( If the target refhash is not None, and the computed reference hash does not match, an exception is thrown. See vamb.vambtools.RefHasher. 
""" - if n_processes < 1: - raise ValueError(f"Must use at least 1 process, not {n_processes}") - # Cap processes, because most OSs cap the number of open file handles, - # and we need one file per process when splitting FASTA file - elif n_processes > 64: - print(f"Warning: Processes set to {n_processes}, capping to 64") - n_processes = 64 - name_to_id: dict[MarkerName, MarkerID] = dict() - - # Create the list of marker names, translating those that are the same, - # but appear under two marker names + n_processes = cap_processes(n_processes) with open(hmm_path, "rb") as file: - for hmm in pyhmmer.plan7.HMMFile(file): - name = hmm.name.decode() - if name in NORMALIZE_MARKER_TRANS_DICT: - continue - name_to_id[MarkerName(name)] = MarkerID(len(name_to_id)) - for old_name, new_name in NORMALIZE_MARKER_TRANS_DICT.items(): - name_to_id[MarkerName(old_name)] = name_to_id[MarkerName(new_name)] - id_to_names: defaultdict[MarkerID, list[MarkerName]] = defaultdict(list) - for name, id in name_to_id.items(): - id_to_names[id].append(name) - marker_names = [id_to_names[MarkerID(i)] for i in range(len(id_to_names))] - - # For safety: Verify that there are no more than 256 MarkerIDs, such that we can - # store them in an uint8 array without overflow (which does not throw errors - # in the current version of Numpy) - assert len(marker_names) <= 256 + hmms = list(pyhmmer.plan7.HMMFile(file)) + (_, marker_names) = get_name_to_id(hmms) (refhash, paths) = split_file( - contigs, tmpdir_to_create, n_processes, fasta_entry_mask + contigs, + contignames, + tmpdir_to_create, + n_processes, ) if target_refhash is not None: @@ -179,16 +159,17 @@ def from_files( refhash, target_refhash, "Markers FASTA file", None, None ) - marker_list: list[Optional[np.ndarray]] = [None] * sum(fasta_entry_mask) + index_of_name = { + ContigName(n): ContigID(i) for (i, n) in enumerate(contignames) + } + marker_list: list[Optional[np.ndarray]] = [None] * len(contignames) with Pool(n_processes) as pool: for sub_result in pool.imap_unordered( work_per_process, - list( - zip(paths, itertools.repeat(hmm_path), itertools.repeat(name_to_id)) - ), + list(zip(paths, itertools.repeat(hmms))), ): - for contig_id, markers in sub_result: - marker_list[contig_id] = markers + for contig_name, markers in sub_result: + marker_list[index_of_name[contig_name]] = markers shutil.rmtree(tmpdir_to_create) markers = cls(marker_list, marker_names, refhash) @@ -196,6 +177,17 @@ def from_files( return markers +def cap_processes(processes: int) -> int: + if processes < 1: + raise ValueError(f"Must use at least 1 process, not {processes}") + # Cap processes, because most OSs cap the number of open file handles, + # and we need one file per process when splitting FASTA file + elif processes > 64: + logger.warning(f"Processes set to {processes}, capping to 64") + return 64 + return processes + + # Some markers have different names, but should be treated as the same SCG. 
NORMALIZE_MARKER_TRANS_DICT = { "TIGR00388": "TIGR00389", @@ -205,31 +197,25 @@ def from_files( } -def filter_contigs(reader: Reader, mask: Sequence[bool]) -> Iterable[FastaEntry]: - for record, keep in itertools.zip_longest(byte_iterfasta(reader), mask): - if record is None or keep is None: - raise ValueError( - "The mask length does not match the length of FASTA records" - ) - if keep: - yield record - - def split_file( - input: Path, tmpdir_to_create: Path, n_splits: int, mask: Sequence[bool] + input: Path, + contignames: Sequence[str], + tmpdir_to_create: Path, + n_splits: int, ) -> tuple[bytes, list[Path]]: + names = set(contignames) os.mkdir(tmpdir_to_create) paths = [tmpdir_to_create.joinpath(str(i)) for i in range(n_splits)] filehandles = [open(path, "w") for path in paths] refhasher = RefHasher() with Reader(input) as infile: - records = filter_contigs(infile, mask) - # We write the index to the record and store that instead of the name. for i, (outfile, record) in enumerate( - zip(itertools.cycle(filehandles), records) + zip( + itertools.cycle(filehandles), + filter(lambda x: x.identifier in names, byte_iterfasta(infile)), + ) ): refhasher.add_refname(record.identifier) - record.identifier = str(i) print(record.format(), file=outfile) for filehandle in filehandles: @@ -243,12 +229,12 @@ def process_chunk( hmms: list[pyhmmer.plan7.HMM], name_to_id: dict[MarkerName, MarkerID], finder: pyrodigal.GeneFinder, -) -> list[tuple[ContigID, np.ndarray]]: +) -> list[tuple[ContigName, np.ndarray]]: # We temporarily store them as sets in order to deduplicate. While single contigs # may have duplicate markers, it makes no sense to count this as contamination, # because we are not about to second-guess the assembler's job of avoiding # chimeric sequences. - markers: defaultdict[ContigID, set[MarkerID]] = defaultdict(set) + markers: defaultdict[ContigName, set[MarkerID]] = defaultdict(set) alphabet = pyhmmer.easel.Alphabet.amino() digitized: list[pyhmmer.easel.DigitalSequence] = [] for record in chunk: @@ -267,27 +253,24 @@ def process_chunk( assert score_cutoff is not None for hit in top_hits: if hit.score >= score_cutoff: - markers[ContigID(int(hit.name.decode()))].add(marker_id) + markers[ContigName(hit.name.decode())].add(marker_id) return [ (name, np.array(list(ids), dtype=np.uint8)) for (name, ids) in markers.items() ] -# We avoid moving data between processes as much as possible, so each process -# only needs these two paths, and this marker name dict which we assume to be small. -# The return type here is optimised for a small memory footprint. 
def work_per_process( - args: tuple[Path, Path, dict[MarkerName, MarkerID]], -) -> list[tuple[ContigID, np.ndarray]]: - (contig_path, hmmpath, name_to_id) = args - with open(hmmpath, "rb") as file: - hmms = list(pyhmmer.plan7.HMMFile(file)) + args: tuple[Path, list[pyhmmer.plan7.HMM]], +) -> list[tuple[ContigName, np.ndarray]]: + (contig_path, hmms) = args + + (name_to_id, _) = get_name_to_id(hmms) # Chunk up the FASTA file for memory efficiency reasons, while still # allowing pyhmmer to scan multiple sequences at once for speed chunk: list[FastaEntry] = [] - result: list[tuple[ContigID, np.ndarray]] = [] + result: list[tuple[ContigName, np.ndarray]] = [] finder = pyrodigal.GeneFinder(meta=True) with open(contig_path, "rb") as file: for record in byte_iterfasta(file): @@ -298,3 +281,26 @@ def work_per_process( result.extend(process_chunk(chunk, hmms, name_to_id, finder)) return result + + +def get_name_to_id( + hmms: list[pyhmmer.plan7.HMM], +) -> tuple[dict[MarkerName, MarkerID], list[list[MarkerName]]]: + name_to_id: dict[MarkerName, MarkerID] = dict() + for hmm in hmms: + name = hmm.name.decode() + if name in NORMALIZE_MARKER_TRANS_DICT: + continue + name_to_id[MarkerName(name)] = MarkerID(len(name_to_id)) + for old_name, new_name in NORMALIZE_MARKER_TRANS_DICT.items(): + name_to_id[MarkerName(old_name)] = name_to_id[MarkerName(new_name)] + + if len(set(name_to_id.values())) > 256: + raise ValueError("Maximum 256 marker IDs") + + id_to_names: defaultdict[MarkerID, list[MarkerName]] = defaultdict(list) + for n, i in name_to_id.items(): + id_to_names[i].append(n) + marker_names = [id_to_names[MarkerID(i)] for i in range(len(id_to_names))] + + return name_to_id, marker_names diff --git a/vamb/reclustering.py b/vamb/reclustering.py index 019acbc1..0c4cfcc8 100644 --- a/vamb/reclustering.py +++ b/vamb/reclustering.py @@ -1,563 +1,342 @@ """ -The following code is a modification of a k-means based reclustering algorithm first published at https://github.com/BigDataBiology/SemiBin +The following code is based on the k-means based reclustering algorithm first published at https://github.com/BigDataBiology/SemiBin The original code is distributed under MIT License. 
""" -import os -import subprocess -import sys -import contextlib from sklearn.cluster import KMeans import numpy as np from collections import defaultdict -import pandas as pd from sklearn.cluster import DBSCAN from sklearn.metrics import pairwise_distances -import gzip -import lzma -from loguru import logger -import vamb - - -def get_best_bin( - results_dict, contignames, contig_to_marker, namelist, contig_dict, minfasta -): - # There is room for improving the loop below to avoid repeated computation - # but it runs very fast in any case - for max_contamination in [0.1, 0.2, 0.3, 0.4, 0.5, 1]: - max_F1 = 0 - weight_of_max = 1e9 - max_bin = None - for res_labels in results_dict.values(): - res = defaultdict(list) - for label, name in zip(res_labels, namelist): - if label != -1: - res[label].append(name) - for bin_contig in res.values(): - cur_weight = sum(len(contig_dict[contig]) for contig in bin_contig) - if cur_weight < minfasta: - continue - marker_list = [] - for contig in bin_contig: - marker_list.extend(contig_to_marker[contig]) - if len(marker_list) == 0: - continue - recall = len(set(marker_list)) / 104 - contamination = (len(marker_list) - len(set(marker_list))) / len( - marker_list - ) - if contamination <= max_contamination: - F1 = ( - 2 - * recall - * (1 - contamination) - / (recall + (1 - contamination)) - ) - if F1 > max_F1: - max_F1 = F1 - weight_of_max = cur_weight - max_bin = bin_contig - elif F1 == max_F1 and cur_weight <= weight_of_max: - weight_of_max = cur_weight - max_bin = bin_contig - if max_F1 > 0: # if there is a bin with F1 > 0 - return max_bin - - -def fasta_iter(fname, full_header=False): - """Iterate over a (possibly gzipped) FASTA file - - Parameters - ---------- - fname : str - Filename. - If it ends with .gz, gzip format is assumed - If .bz2 then bzip2 format is assumed - if .xz, then lzma format is assumerd - full_header : boolean (optional) - If True, yields the full header. Otherwise (the default), only the - first word - - Yields - ------ - (h,seq): tuple of (str, str) - """ - header = None - chunks = [] - if hasattr(fname, "readline"): - - def op(f, _): - return f - - elif fname.endswith(".gz"): - op = gzip.open - elif fname.endswith(".bz2"): - import bz2 - - op = bz2.open - elif fname.endswith(".xz"): - op = lzma.open - else: - op = open - with op(fname, "rt") as f: - for line in f: - if line[0] == ">": - if header is not None: - yield header, "".join(chunks) - line = line[1:].strip() - if not line: - header = "" - elif full_header: - header = line.strip() - else: - header = line.split()[0] - chunks = [] - else: - chunks.append(line.strip()) - if header is not None: - yield header, "".join(chunks) - - -normalize_marker_trans__dict = { - "TIGR00388": "TIGR00389", - "TIGR00471": "TIGR00472", - "TIGR00408": "TIGR00409", - "TIGR02386": "TIGR02387", -} - - -def replace_bin_names(data, clusters_labels_map): - """ - The 'orf' column is formatted as 'bin.contig_id' - but there is no need to search again if only the bins are different but the contigs are the same. - This function replaces bin names in the 'orf' column - with the new bin names from another cluster file given the same contigs. 
- - TODO: refactor the whole file so there is no need for that - - Input: - - data: pd.DataFrame - - clusters_labels_map: dict, {contig: bin} - - Returns a copy of the dataframe with the updated 'orf' column - """ - data = data.copy() - data["contig_number"] = data["orf"].str.split(pat=".", n=1, expand=True)[1] - data["contig_only"] = data["contig_number"].map(lambda x: x.rsplit("_", 1)[0]) - data["old_bin"] = data["orf"].str.split(pat=".", n=0, expand=True)[0] - data["new_bin"] = data["contig_only"].map( - lambda x: f"bin{clusters_labels_map[x]:06}" - ) - data["orf"] = data[["new_bin", "contig_number"]].agg(".".join, axis=1) - return data - - -def get_marker( - hmmout, - fasta_path=None, - min_contig_len=None, - multi_mode=False, - orf_finder=None, - contig_to_marker=False, - clusters_dict=None, -): - """Parse HMM output file and return markers""" - data = pd.read_table( - hmmout, - sep=r"\s+", - comment="#", - header=None, - usecols=(0, 3, 5, 15, 16), - names=["orf", "gene", "qlen", "qstart", "qend"], - ) - if clusters_dict is not None: - data = replace_bin_names(data, clusters_dict) - if not len(data): - return [] - data["gene"] = data["gene"].map(lambda m: normalize_marker_trans__dict.get(m, m)) - qlen = data[["gene", "qlen"]].drop_duplicates().set_index("gene")["qlen"] - - def contig_name(ell): - if orf_finder in ["prodigal", "fast-naive"]: - contig, _ = ell.rsplit("_", 1) - else: - contig, _, _, _ = ell.rsplit("_", 3) - return contig - - data = data.query("(qend - qstart) / qlen > 0.4").copy() - data["contig"] = data["orf"].map(contig_name) - if min_contig_len is not None: - contig_len = {h.split(".", 1)[1]: len(seq) for h, seq in fasta_iter(fasta_path)} - data = data[ - data["contig"].map( - lambda c: contig_len[c.split(".", 1)[1]] >= min_contig_len +from vamb.taxonomy import Taxonomy +from vamb.parsemarkers import Markers, MarkerID +from vamb.parsecontigs import CompositionMetaData +from vamb.vambtools import RefHasher +from collections.abc import Sequence, Iterable +from typing import NewType, Optional, Union + +# We use these aliases to be able to work with integers, which is faster. +ContigId = NewType("ContigId", int) +BinId = NewType("BinId", int) + +# TODO: We might want to benchmark the best value for this constant. +# Right now, we do too much duplicated work by clustering 18 times. 
+EPS_VALUES = np.arange(0.01, 0.35, 0.02) + + +class KmeansAlgorithm: + "Arguments needed specifically when using the KMeans algorithm" + + def __init__( + self, clusters: list[set[ContigId]], random_seed: int, contiglengths: np.ndarray + ): + assert np.issubdtype(contiglengths.dtype, np.integer) + self.contiglengths = contiglengths + self.clusters = clusters + self.random_seed = random_seed + + +class DBScanAlgorithm: + "Arguments needed specifically when using the DBScan algorithm" + + def __init__( + self, comp_metadata: CompositionMetaData, taxonomy: Taxonomy, n_processes: int + ): + if not taxonomy.is_canonical: + raise ValueError( + "Can only run DBScan on a Taxonomy object with is_canonical set" ) - ] - data = data.drop_duplicates(["gene", "contig"]) - - if contig_to_marker: - from collections import defaultdict - - marker = data["gene"].values - contig = data["contig"].str.lsplit(".").str[-1].values - sequence2markers = defaultdict(list) - for m, c in zip(marker, contig): - sequence2markers[c].append(m) - return sequence2markers - else: - - def extract_seeds(vs, sel): - vs = vs.sort_values() - median = vs[len(vs) // 2] - - # the original version broke ties by picking the shortest query, so we - # replicate that here: - candidates = vs.index[vs == median] - c = qlen.loc[candidates].idxmin() - r = list(sel[sel["gene"] == c]["contig"]) - r.sort() - return r - - if multi_mode: - data["bin"] = data["orf"].str.split(pat=".", n=0, expand=True)[0] - counts = data.groupby(["bin", "gene"])["orf"].count() - res = {} - for b, vs in counts.groupby(level=0): - cs = extract_seeds( - vs.droplevel(0), data.query("bin == @b", local_dict={"b": b}) - ) - res[b] = [c.split(".", 1)[1] for c in cs] - return res - else: - counts = data.groupby("gene")["orf"].count() - return extract_seeds(counts, data) - - -def run_prodigal(fasta_path, num_process, output): - contigs = {} - for h, seq in fasta_iter(fasta_path): - contigs[h] = seq - - total_len = sum(len(s) for s in contigs.values()) - split_len = total_len // num_process - - cur = split_len + 1 - next_ix = 0 - out = None - with contextlib.ExitStack() as stack: - for h, seq in contigs.items(): - if cur > split_len and next_ix < num_process: - if out is not None: - out.close() - out = open(os.path.join(output, "contig_{}.fa".format(next_ix)), "wt") - out = stack.enter_context(out) - - cur = 0 - next_ix += 1 - out.write(f">{h}\n{seq}\n") - cur += len(seq) - - try: - process = [] - for index in range(next_ix): - with open( - os.path.join(output, f"contig_{index}_log.txt") + ".out", "w" - ) as prodigal_out_log: - p = subprocess.Popen( - [ - "prodigal", - "-i", - os.path.join(output, f"contig_{index}.fa"), - "-p", - "meta", - "-q", - "-m", # See https://github.com/BigDataBiology/SemiBin/issues/87 - "-a", - os.path.join(output, f"contig_{index}.faa"), - ], - stdout=prodigal_out_log, - ) - process.append(p) - - for p in process: - p.wait() - - except: - sys.stderr.write("Error: Running prodigal fail\n") - sys.exit(1) - - contig_output = os.path.join(output, "contigs.faa") - with open(contig_output, "w") as f: - for index in range(next_ix): - f.write( - open(os.path.join(output, "contig_{}.faa".format(index)), "r").read() - ) - return contig_output - - -def cal_num_bins( - fasta_path, - binned_length, - num_process, - output, - markers_path=None, - clusters_dict=None, - multi_mode=False, - orf_finder="prodigal", -): - """Estimate number of bins from a FASTA file - - Parameters - fasta_path: path - binned_length: int (minimal contig length) - num_process: int 
(number of CPUs to use) - multi_mode: bool, optional (if True, treat input as resulting from concatenating multiple files) - """ - if markers_path is None: - markers_path = os.path.join(output, "markers.hmmout") - if os.path.exists(markers_path): - return get_marker( - markers_path, - fasta_path, - binned_length, - multi_mode, - orf_finder=orf_finder, - clusters_dict=clusters_dict, + RefHasher.verify_refhash( + taxonomy.refhash, + comp_metadata.refhash, + "taxonomy", + "composition", + None, ) - else: - os.makedirs(output, exist_ok=True) - target_dir = output - - contig_output = run_prodigal(fasta_path, num_process, output) - - hmm_output = os.path.join(target_dir, "markers.hmmout") - try: - with open(os.path.join(output, "markers.hmmout.out"), "w") as hmm_out_log: - subprocess.check_call( - [ - "hmmsearch", - "--domtblout", - hmm_output, - "--cut_tc", - "--cpu", - str(num_process), - os.path.split(__file__)[0] + "/marker.hmm", - contig_output, - ], - stdout=hmm_out_log, - ) - except: - if os.path.exists(hmm_output): - os.remove(hmm_output) - sys.stderr.write("Error: Running hmmsearch fail\n") - sys.exit(1) - - return get_marker( - hmm_output, fasta_path, binned_length, multi_mode, orf_finder=orf_finder - ) + self.contiglengths = comp_metadata.lengths + self.taxonomy = taxonomy + self.n_processes = n_processes def recluster_bins( - clusters_path, - latents_path, - contigs_path, - contignames_all, - minfasta, - binned_length, - num_process, - out_dir, - random_seed, - algorithm, - hmmout_path=None, - predictions_path=None, -): - contig_dict = {h: seq for h, seq in fasta_iter(contigs_path)} - embedding = np.load(latents_path) - if "arr_0" in embedding: - embedding = embedding["arr_0"] - if (algorithm == "dbscan") and predictions_path: - df_gt = pd.read_csv(predictions_path) - column = "predictions" - df_gt["genus"] = df_gt[column].str.split(";").str[5].fillna("mock") - species = list(set(df_gt["genus"])) - species_ind = {s: i for i, s in enumerate(species)} - clusters_labels_map = { - k: species_ind[v] for k, v in zip(df_gt["contigs"], df_gt["genus"]) - } - else: - with open(clusters_path) as file: - clusters = vamb.vambtools.read_clusters(file) - index_of_cluster = {c: i for (i, c) in enumerate(clusters.keys())} - clusters_labels_map: dict[str, int] = dict() - for cluster, contigs in clusters.items(): - for contig in contigs: - clusters_labels_map[contig] = index_of_cluster[cluster] - labels_cluster_map = defaultdict(list) - for k, v in clusters_labels_map.items(): - labels_cluster_map[v].append(k) - indices_contigs = {c: i for i, c in enumerate(contignames_all)} - logger.info(f"\tLatent shape {embedding.shape}") - logger.info(f"\tN contignames for the latent space {len(contignames_all)}") - logger.info(f"\tN contigs in fasta files {len(contig_dict)}") - logger.info(f"\tN contigs in the cluster file {len(clusters_labels_map)}") - - contig_labels = np.array([clusters_labels_map[c] for c in contignames_all]) - - assert len(contig_labels) == embedding.shape[0] - - total_size = defaultdict(int) - for i, c in enumerate(contig_labels): - total_size[c] += len(contig_dict[contignames_all[i]]) - - cfasta = os.path.join(out_dir, "concatenated.fna") - with open(cfasta, "wt") as concat_out: - for ix, h in enumerate(contignames_all): - bin_ix = contig_labels[ix] - if total_size[bin_ix] < minfasta: - continue - concat_out.write(f">bin{bin_ix:06}.{h}\n") - concat_out.write(contig_dict[contignames_all[ix]] + "\n") - - logger.info("\tStarting searching for markers") - - if hmmout_path is not None: - 
logger.info(f"\t\thmmout file is provided at {hmmout_path}") - seeds = cal_num_bins( - cfasta, - binned_length, - num_process, - output=out_dir, - markers_path=hmmout_path, - clusters_dict=clusters_labels_map, - multi_mode=True, + markers: Markers, + latent: np.ndarray, + algorithm: Union[KmeansAlgorithm, DBScanAlgorithm], +) -> list[set[ContigId]]: + assert np.issubdtype(algorithm.contiglengths.dtype, np.integer) + assert np.issubdtype(latent.dtype, np.floating) + + if not (len(algorithm.contiglengths) == markers.n_seqs == len(latent)): + raise ValueError( + "Number of elements in contiglengths, markers and latent must match" ) - else: - seeds = cal_num_bins( - cfasta, - binned_length, - num_process, - output=out_dir, - multi_mode=True, + + # Simply dispatch to the right implementation based on the algorithm used + if isinstance(algorithm, KmeansAlgorithm): + return recluster_kmeans( + algorithm.clusters, + latent, + algorithm.contiglengths, + markers, + algorithm.random_seed, ) - # we cannot bypass the orf_finder here, because of the renaming of the contigs - if seeds == []: - logger.info("\tNo bins found in the concatenated fasta file.") - return contig_labels - logger.info("\tFinished searching for markers") - logger.info(f"\tFound {len(seeds)} seeds") - - if algorithm == "dbscan": - contig2marker = get_marker( - hmmout_path, - min_contig_len=binned_length, - fasta_path=cfasta, - orf_finder="prodigal", - clusters_dict=clusters_labels_map, - contig_to_marker=True, + elif isinstance(algorithm, DBScanAlgorithm): + assert len(algorithm.taxonomy.contig_taxonomies) == markers.n_seqs + return recluster_dbscan( + algorithm.taxonomy, + latent, + algorithm.contiglengths, + markers, + algorithm.n_processes, ) - logger.info("\tRunning DBSCAN with cosine distance") - extracted_all = [] - for k, v in labels_cluster_map.items(): - logger.info(f"\tLabel {k}, {len(v)} contigs") - indices = [indices_contigs[c] for c in v] - contignames = contignames_all[indices] - embedding_new = embedding[indices] - DBSCAN_results_dict = {} - distance_matrix = pairwise_distances( - embedding_new, embedding_new, metric="cosine" - ) - length_weight = np.array([len(contig_dict[name]) for name in contignames]) - for eps_value in np.arange(0.01, 0.35, 0.02): - dbscan = DBSCAN( - eps=eps_value, - min_samples=5, - n_jobs=num_process, - metric="precomputed", - ) - dbscan.fit(distance_matrix, sample_weight=length_weight) - labels = dbscan.labels_ - logger.info(f"\t\tepsilon {eps_value}, {len(set(labels))} labels") - DBSCAN_results_dict[eps_value] = labels.tolist() - - logger.info("\tIntegrating results") - - extracted = [] - contignames_list = list(contignames) - while ( - sum(len(contig_dict[contig]) for contig in contignames_list) >= minfasta - ): - if len(contignames_list) == 1: - extracted.append(contignames_list) - break - - max_bin = get_best_bin( - DBSCAN_results_dict, - contig2marker, - contignames_list, - contig_dict, - minfasta, - ) - if not max_bin: - break - - extracted.append(max_bin) - for temp in max_bin: - temp_index = contignames_list.index(temp) - contignames_list.pop(temp_index) - for eps_value in DBSCAN_results_dict: - DBSCAN_results_dict[eps_value].pop(temp_index) - extracted_all.extend(extracted) - - contig2ix = {} - logger.info(f"\t{len(extracted_all)} extracted clusters") - for i, cs in enumerate(extracted_all): - for c in cs: - contig2ix[c] = i - namelist = contignames_all.copy() - contig_labels_reclustered = [contig2ix.get(c, -1) for c in namelist] - elif algorithm == "kmeans": - name2ix = {name: ix 
for ix, name in enumerate(contignames_all)} - contig_labels_reclustered = np.empty_like(contig_labels) - contig_labels_reclustered.fill(-1) - next_label = 0 - for bin_ix in range(contig_labels.max() + 1): - seed = seeds.get(f"bin{bin_ix:06}", []) - num_bin = len(seed) - - if num_bin > 1 and total_size[bin_ix] >= minfasta: - contig_indices = [ - i for i, ell in enumerate(contig_labels) if ell == bin_ix - ] - re_bin_features = embedding[contig_indices] - - seed_index = [name2ix[s] for s in seed] - length_weight = np.array( - [ - len(contig_dict[name]) - for name, ell in zip(contignames_all, contig_labels) - if ell == bin_ix - ] - ) - seeds_embedding = embedding[seed_index] - logger.info(f"\t\tStarting K-means reclutering for bin {bin_ix}") - kmeans = KMeans( - n_clusters=num_bin, - init=seeds_embedding, - n_init=1, - random_state=random_seed, - ) - kmeans.fit(re_bin_features, sample_weight=length_weight) - logger.info("\t\tFinished K-means reclutering") - for i, label in enumerate(kmeans.labels_): - contig_labels_reclustered[contig_indices[i]] = next_label + label - next_label += num_bin - else: - contig_labels_reclustered[contig_labels == bin_ix] = next_label - next_label += 1 - assert contig_labels_reclustered.min() >= 0 - os.remove(cfasta) - else: - raise AssertionError( - "Reclustering algorithm must be one of ['dbscan', 'kmeans']" + + +def recluster_kmeans( + clusters: list[set[ContigId]], + latent: np.ndarray, + contiglengths: np.ndarray, + markers: Markers, + random_seed: int, +) -> list[set[ContigId]]: + assert len(latent) == len(contiglengths) == markers.n_seqs + assert np.issubdtype(contiglengths.dtype, np.integer) + assert np.issubdtype(latent.dtype, np.floating) + assert latent.ndim == 2 + + result: list[set[ContigId]] = [] + indices_by_medoid: dict[int, set[ContigId]] = defaultdict(set) + # We loop over all existing clusters, and determine if they should be split, + # by looking at the median number of single-copy genes in the cluster + for cluster in clusters: + # All clusters with 1 contig by definition cannot have multiple single-copy + # genes (because SCGs are deduplicated within a single contig) + if len(cluster) == 1: + result.append(cluster) + continue + # Get a count of each marker and compute the median count of SCGs + counts = count_markers(cluster, markers) + cp = counts.copy() + cp.sort() + median_counts: int = cp[len(cp) // 2] + # If we have less than 2 SCGs on average, the cluster should not be split, + # and we emit it unchanged + if median_counts < 2: + result.append(cluster) + continue + + # Run K-means with the median number of SCGs to split the contig. + # We weigh the contigs by length. + seeds = get_kmeans_seeds( + cluster, + markers, + contiglengths, # type: ignore + counts, + median_counts, + ) + + cluster_indices = np.array(list(cluster)) + cluter_latent = latent[cluster_indices] + cluster_lengths = contiglengths[cluster_indices] + seed_latent = latent[seeds] + kmeans = KMeans( + n_clusters=median_counts, + init=seed_latent, + n_init=1, + random_state=random_seed, + ) + kmeans.fit(cluter_latent, sample_weight=cluster_lengths) + indices_by_medoid.clear() + for cluster_label, index in zip(kmeans.labels_, cluster_indices): + indices_by_medoid[cluster_label].add(ContigId(index)) + result.extend(indices_by_medoid.values()) + + return result + + +# Get a vector of counts, of each SCG, +# where if MarkerID(5) is seen 9 times, then counts[5] == 9. 
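# For example (hypothetical counts): if one contig in the cluster carries markers {3, 5}
# and another carries {5, 7}, the returned vector has counts[3] == 1, counts[5] == 2 and
# counts[7] == 1, with zeros everywhere else.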
+def count_markers( + contigs: Iterable[ContigId], + markers: Markers, +) -> np.ndarray: + counts = np.zeros(markers.n_markers, dtype=np.int32) + for contig in contigs: + m = markers.markers[contig] + if m is not None: + counts[m] += 1 + return counts + + +# Same as above, but once we see a very high number of marker genes, +# we bail. This is because a large fraction of time spent in this module +# would otherwise be counting markers of huge clusters, long after we already +# know it's hopelessly contaminated +def count_markers_saturated( + contigs: Iterable[ContigId], + markers: Markers, +) -> Optional[np.ndarray]: + counts = np.zeros(markers.n_markers, dtype=np.int32) + n_markers = 0 + n_unique = 0 + # This implies contamination == 1.0 + max_duplicates = 1 * markers.n_markers + for contig in contigs: + m = markers.markers[contig] + if m is not None: + n_markers += len(m) + for i in m: + existing = counts[i] + n_unique += existing == 0 + counts[i] = existing + 1 + + if (n_markers - n_unique) > max_duplicates: + return None + return counts + + +# This is not very effectively implemented, but I assume it does not matter. +# This function looks at all markers that occur exactly `median` times, each of these +# markers corresponding to a list of `median` number of contigs. +# It picks the marker for which the smallest contig that contains it is largest. +# The idea here is that long contigs, which contain one of the SCGs that exist exactly +# `median` times are most likely to be close to the actual medoid +# that Kmeans needs to find. +# This is just one possible seeding strategy. We could also plausibly choose e.g. +# the trio of contigs that have the most SCGs. +def get_kmeans_seeds( + contigs: Iterable[ContigId], + markers: Markers, + contiglengths: Sequence[int], + counts: np.ndarray, + median: int, +) -> list[ContigId]: + considered_markers = {MarkerID(i) for (i, c) in enumerate(counts) if c == median} + contigs_of_markers: dict[MarkerID, list[ContigId]] = defaultdict(list) + for contig in contigs: + m = markers.markers[contig] + if m is None: + continue + for mid in m: + if mid not in considered_markers: + continue + contigs_of_markers[MarkerID(mid)].append(contig) + + candidate_list = list(contigs_of_markers.items()) + pair = max(candidate_list, key=lambda x: min(contiglengths[i] for i in x[1])) + result = pair[1] + assert len(result) == median + return result + + +def get_completeness_contamination(counts: np.ndarray) -> tuple[float, float]: + n_total = counts.sum() + n_unique = (counts > 0).sum() + completeness = n_unique / len(counts) + contamination = (n_total - n_unique) / len(counts) + return (completeness, contamination) + + +# An arbitrary score of a bin, where higher numbers is better. +# completeness - 5 * contamination is used by the CheckM group as a heuristic. 
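# Worked example with made-up numbers: given 104 markers of which 80 are seen at least once
# and 90 hits in total, completeness = 80/104 ≈ 0.77, contamination = (90 - 80)/104 ≈ 0.096,
# giving a score of roughly 0.77 - 5 * 0.096 ≈ 0.29.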
+def score_from_comp_cont(comp_cont: tuple[float, float]) -> float:
+    return comp_cont[0] - 5 * comp_cont[1]
+
+
+def recluster_dbscan(
+    taxonomy: Taxonomy,
+    latent: np.ndarray,
+    contiglengths: np.ndarray,
+    markers: Markers,
+    num_processes: int,
+) -> list[set[ContigId]]:
+    # Since DBScan is computationally expensive, and scales poorly with the number
+    # of contigs, we use taxonomy to only cluster within each genus
+    result: list[set[ContigId]] = []
+    for indices in group_indices_by_genus(taxonomy):
+        genus_latent = latent[indices]
+        genus_clusters = dbscan_genus(
+            genus_latent, indices, contiglengths[indices], markers, num_processes
+        )
+        result.extend(genus_clusters)
+
+    return result
+
+
+# DBScan within the subset of contigs that are annotated with a single genus
+def dbscan_genus(
+    latent_of_genus: np.ndarray,
+    original_indices: np.ndarray,
+    contiglengths_of_genus: np.ndarray,
+    markers: Markers,
+    num_processes: int,
+) -> list[set[ContigId]]:
+    assert len(latent_of_genus) == len(original_indices) == len(contiglengths_of_genus)
+    # Precompute distance matrix. This is O(N^2), but DBScan is even worse,
+    # so this pays off.
+    # TODO: Maybe we should emit a warning if this function is called with too
+    # many points such that this matrix becomes huge?
+    distance_matrix = pairwise_distances(
+        latent_of_genus, latent_of_genus, metric="cosine"
+    )
+    best_bins: tuple[int, dict[int, set[ContigId]]] = (-1, dict())
+    # The DBScan approach works by blindly clustering with different eps values
+    # (a critical parameter for DBscan), and then using SCGs to select the best
+    # subset of clusters.
+    # It's ugly and wasteful, but it does work.
+    n_worse_in_row = 0
+    for eps in EPS_VALUES:
+        print(f"Running eps: {eps}")
+        dbscan = DBSCAN(
+            eps=eps,
+            min_samples=5,
+            n_jobs=num_processes,
+            metric="precomputed",
         )
-    return contig_labels_reclustered
+        dbscan.fit(distance_matrix, sample_weight=contiglengths_of_genus)
+        these_bins: dict[int, set[ContigId]] = defaultdict(set)
+        for original_index, bin_index in zip(original_indices, dbscan.labels_):
+            these_bins[bin_index].add(ContigId(original_index))
+        score = count_good_genomes(these_bins.values(), markers)
+        print(f"Score: {score}, best: {best_bins[0]}, worse: {n_worse_in_row}")
+        if score > best_bins[0]:
+            best_bins = (score, these_bins)
+            n_worse_in_row = 0
+        elif score < best_bins[0]:
+            # This is an elif statement and not an if statement because we don't
+            # want e.g. a series of [1,1,1,1,1] genomes to be considered a performance
+            # degradation leading to early exit, when in fact, it probably means
+            # that eps is too low and should be increased.
+            n_worse_in_row += 1
+        # If we see 3 worse clusterings in a row, we exit early.
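+        # For example, scores of 5, 4, 4, 3 over successive eps values stop the scan
+        # after the fourth clustering; a score equal to the current best neither counts
+        # as worse nor resets the counter.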
+
+        if n_worse_in_row > 2:
+            break
+
+    return list(best_bins[1].values())
+
+
+def count_good_genomes(binning: Iterable[Iterable[ContigId]], markers: Markers) -> int:
+    max_contamination = 0.3
+    min_completeness = 0.75
+    result = 0
+    for contigs in binning:
+        count = count_markers_saturated(contigs, markers)
+        if count is None:
+            continue
+        (comp, cont) = get_completeness_contamination(count)
+        if comp >= min_completeness and cont <= max_contamination:
+            result += 1
+
+    return result
+
+
+def group_indices_by_genus(
+    taxonomy: Taxonomy,
+) -> list[np.ndarray]:
+    if not taxonomy.is_canonical:
+        raise ValueError("Can only group by genus for a canonical taxonomy")
+    by_genus: dict[Optional[str], list[ContigId]] = defaultdict(list)
+    for i, tax in enumerate(taxonomy.contig_taxonomies):
+        genus = None if tax is None else tax.genus
+        by_genus[genus].append(ContigId(i))
+    return [np.array(i, dtype=np.int32) for i in by_genus.values()]
diff --git a/vamb/taxonomy.py b/vamb/taxonomy.py
new file mode 100644
index 00000000..7e88919d
--- /dev/null
+++ b/vamb/taxonomy.py
@@ -0,0 +1,175 @@
+from typing import Optional, IO
+from pathlib import Path
+from vamb.parsecontigs import CompositionMetaData
+import numpy as np
+
+
+class ContigTaxonomy:
+    """
+    Hierarchical taxonomy of some contig.
+    If `is_canonical`, the ranks are assumed to be domain, phylum, class,
+    order, family, genus, species, in that order.
+    The taxonomy may be arbitrarily truncated, e.g. ["Eukaryota", "Chordata"]
+    is a valid (canonical) taxonomy for a human.
+    """
+
+    __slots__ = ["ranks"]
+
+    def __init__(self, ranks: list[str], is_canonical: bool = False):
+        if is_canonical and len(ranks) > 7:
+            raise ValueError(
+                "For a canonical ContigTaxonomy, there must be at most 7 ranks"
+            )
+
+        self.ranks = ranks
+
+    @classmethod
+    def from_semicolon_sep(cls, s: str, is_canonical: bool = False):
+        return cls(s.split(";"), is_canonical)
+
+    @property
+    def genus(self) -> Optional[str]:
+        if len(self.ranks) < 6:
+            return None
+        return self.ranks[5]
+
+
+class Taxonomy:
+    """
+    * contig_taxonomies: An Optional[ContigTaxonomy] for every contig given by the
+      CompositionMetaData used to instantiate
+    * refhash: Refhash of CompositionMetaData used to instantiate
+    * is_canonical: If the taxonomy uses the canonical seven ranks
+      (domain, phylum, class, order, family, genus, species).
+    """
+
+    __slots__ = ["contig_taxonomies", "refhash", "is_canonical"]
+
+    @property
+    def nseqs(self) -> int:
+        return len(self.contig_taxonomies)
+
+    @classmethod
+    def from_file(
+        cls, tax_file: Path, metadata: CompositionMetaData, is_canonical: bool
+    ):
+        observed = cls.parse_tax_file(tax_file, is_canonical)
+        return cls.from_observed(observed, metadata, is_canonical)
+
+    @classmethod
+    def from_observed(
+        cls,
+        observed_taxonomies: list[tuple[str, ContigTaxonomy]],
+        metadata: CompositionMetaData,
+        is_canonical: bool,
+    ):
+        index_of_contigname: dict[str, int] = {
+            c: i for (i, c) in enumerate(metadata.identifiers)
+        }
+        contig_taxonomies: list[Optional[ContigTaxonomy]] = [None] * len(
+            metadata.identifiers
+        )
+        for contigname, taxonomy in observed_taxonomies:
+            index = index_of_contigname.get(contigname)
+            if index is None:
+                raise ValueError(
+                    f'When parsing taxonomy, found contigname "{contigname}", '
+                    "but no sequence of that name is in the FASTA file"
+                )
+            existing = contig_taxonomies[index]
+            if existing is not None:
+                raise ValueError(
+                    f'Duplicate contigname when parsing taxonomy: "{contigname}"'
+                )
+            contig_taxonomies[index] = taxonomy
+        return cls(contig_taxonomies, metadata.refhash, is_canonical)
+
+    def __init__(
+        self,
+        contig_taxonomies: list[Optional[ContigTaxonomy]],
+        refhash: bytes,
+        is_canonical: bool,
+    ):
+        self.contig_taxonomies = contig_taxonomies
+        self.refhash = refhash
+        self.is_canonical = is_canonical
+
+    @staticmethod
+    def parse_tax_file(
+        path: Path, force_canonical: bool
+    ) -> list[tuple[str, ContigTaxonomy]]:
+        with open(path) as file:
+            result: list[tuple[str, ContigTaxonomy]] = []
+            lines = filter(None, map(str.rstrip, file))
+            header = next(lines, None)
+            if header is None or not header.startswith("contigs\tpredictions"):
+                raise ValueError(
+                    'In taxonomy file, expected header to begin with "contigs\\tpredictions"'
+                )
+            for line in lines:
+                (contigname, taxonomy, *_) = line.split("\t")
+                result.append(
+                    (
+                        contigname,
+                        ContigTaxonomy.from_semicolon_sep(taxonomy, force_canonical),
+                    )
+                )
+
+        return result
+
+
+class PredictedContigTaxonomy:
+    __slots__ = ["tax", "probs"]
+
+    def __init__(self, tax: ContigTaxonomy, probs: np.ndarray):
+        if len(probs) != len(tax.ranks):
+            raise ValueError("The length of probs must equal that of ranks")
+        self.tax = tax
+        self.probs = probs
+
+
+class PredictedTaxonomy:
+    "Output of Taxometer"
+
+    __slots__ = ["contig_taxonomies", "refhash", "is_canonical"]
+
+    def __init__(
+        self,
+        taxonomies: list[PredictedContigTaxonomy],
+        metadata: CompositionMetaData,
+        is_canonical: bool,
+    ):
+        if len(taxonomies) != len(metadata.identifiers):
+            raise ValueError("Length of taxonomies must match that of identifiers")
+
+        self.contig_taxonomies = taxonomies
+        self.refhash = metadata.refhash
+        self.is_canonical = is_canonical
+
+    def to_taxonomy(self) -> Taxonomy:
+        lst: list[Optional[ContigTaxonomy]] = [p.tax for p in self.contig_taxonomies]
+        return Taxonomy(lst, self.refhash, self.is_canonical)
+
+    @property
+    def nseqs(self) -> int:
+        return len(self.contig_taxonomies)
+
+    def write_as_tsv(self, file: IO[str], comp_metadata: CompositionMetaData):
+        if self.refhash != comp_metadata.refhash:
+            raise ValueError(
+                "Refhash of comp_metadata and predicted taxonomy must match"
+            )
+        assert self.nseqs == comp_metadata.nseqs
+        print("contigs\tpredictions\tlengths\tscores", file=file)
+        for i in range(self.nseqs):
+            tax = self.contig_taxonomies[i]
+            ranks_str = ";".join(tax.tax.ranks)
";".join([str(round(i, 5)) for i in tax.probs]) + print( + comp_metadata.identifiers[i], + ranks_str, + comp_metadata.lengths[i], + probs_str, + file=file, + sep="\t", + ) diff --git a/vamb/taxvamb_encode.py b/vamb/taxvamb_encode.py index 79ea5333..1541b751 100644 --- a/vamb/taxvamb_encode.py +++ b/vamb/taxvamb_encode.py @@ -23,7 +23,7 @@ import vamb.semisupervised_encode as _semisupervised_encode import vamb.encode as _encode import vamb.hloss_misc as _hloss -from vamb.vambtools import ContigTaxonomy +from vamb.taxonomy import ContigTaxonomy def make_graph( diff --git a/vamb/vambtools.py b/vamb/vambtools.py index c60d9398..b43c683c 100644 --- a/vamb/vambtools.py +++ b/vamb/vambtools.py @@ -9,7 +9,7 @@ import collections as _collections from itertools import zip_longest from hashlib import md5 as _md5 -from collections.abc import Iterable, Iterator, Sequence +from collections.abc import Iterable, Iterator from typing import Optional, IO, Union from pathlib import Path from loguru import logger @@ -223,66 +223,6 @@ def clear(self) -> None: self.length = 0 -class ContigTaxonomy: - """ - Hierarchical taxonomy of some contig. - If `is_canonical`, the ranks are assumed to be domain, phylum, class, - order, family, genus, species, in that order. - The taxonomy may be arbitrarily truncated, e.g. ["Eukaryota", "Chordata"] - is a valid (canonical) taxonomy for a human. - """ - - __slots__ = ["is_canonical", "ranks"] - - def __init__(self, ranks: list[str], is_canonical: bool = False): - if is_canonical and len(ranks) > 7: - raise ValueError( - "For a canonical ContigTaxonomy, there must be at most 7 ranks" - ) - - self.ranks = ranks - self.is_canonical = is_canonical - - @classmethod - def from_semicolon_sep(cls, s: str, is_canonical: bool = False): - return cls(s.split(";"), is_canonical) - - @property - def genus(self) -> Optional[str]: - if not self.is_canonical: - raise ValueError("Taxonomy must be canonical to get genus") - if len(self.ranks) < 6: - return None - return self.ranks[5] - - -def parse_taxonomy( - input_file: Path, contignames: Sequence[str], is_canonical: bool -) -> list[Optional[ContigTaxonomy]]: - result: list[Optional[ContigTaxonomy]] = [None] * len(contignames) - index_of_contigname = {c: i for (i, c) in enumerate(contignames)} - with open(input_file) as file: - lines = filter(None, map(str.strip, file)) - header = next(lines) - assert header.startswith("contigs\tpredictions") - for line in lines: - (contigname, taxonomy, *_) = line.split("\t") - index = index_of_contigname.get(contigname) - if index is None: - raise ValueError( - f'When parsing taxonomy, found contigname "{contigname}", ' - "but no sequence of that name is in the FASTA file" - ) - existing = result[index] - if existing is not None: - raise ValueError( - f'Duplicate contigname when parsing taxonomy: "{contigname}"' - ) - result[index] = ContigTaxonomy.from_semicolon_sep(taxonomy, is_canonical) - - return result - - def zscore( array: _np.ndarray, axis: Optional[int] = None, inplace: bool = False ) -> _np.ndarray: