diff --git a/.gitignore b/.gitignore index dbe311ab..335cd661 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ TODO.md build/ dist/ **.vscode +doc/_build diff --git a/README.md b/README.md index 09344df2..1c9fa478 100644 --- a/README.md +++ b/README.md @@ -92,17 +92,17 @@ spades.py --meta /path/to/reads/sample1.fw.fq.gz /path/to/reads/sample1.rv.fq.gz python concatenate.py /path/to/catalogue.fna.gz /path/to/assemblies/sample1/contigs.fasta /path/to/assemblies/sample2/contigs.fasta [ ... ] ``` -3. Use your favorite aligner to map each of your read files back to the FASTA file: +3. Use your favorite aligner to map each of your read files back to the FASTA file. + We recommend using `strobealign` with the `--aemb` flag, since this is extremely fast, and mapping is a significant bottleneck when running Vamb: ``` -minimap2 -d catalogue.mmi /path/to/catalogue.fna.gz; # make index -minimap2 -t 8 -N 5 -ax sr catalogue.mmi --split-prefix mmsplit /path/to/reads/sample1.fw.fq.gz /path/to/reads/sample1.rv.fq.gz | samtools view -F 3584 -b --threads 8 > /path/to/bam/sample1.bam +strobealign -t 8 --aemb /path/to/catalogue.fna.gz /path/to/reads/sample1.fw.fq.gz /path/to/reads/sample1.rv.fq.gz > aembdir/sample1.aemb.tsv ``` 4. Run Vamb: ``` -vamb bin default --outdir path/to/outdir --fasta /path/to/catalogue.fna.gz --bamdir /path/to/bams -o C +vamb bin default --outdir path/to/outdir --fasta /path/to/catalogue.fna.gz --aemb aembdir ``` 5. Apply any desired postprocessing to Vamb's output. @@ -164,29 +164,27 @@ Vamb is fairly memory efficient, and we have run Vamb with 1000 samples and 5.9 If you have a dataset too large to fit in RAM and feel the temptation to bin each sample individually, you can instead use a tool like `sourmash` to group similar samples together in smaller batches, bin these batches individually. This way, you can still leverage co-abundance, and will get much better results. -__4) Map the reads to the FASTA file to obtain BAM files__ - -:warning: *Important:* Vamb only accepts BAM files sorted by coordinate. You can sort BAM files with `samtools sort`. - -Be careful to choose proper parameters for your aligner - in general, if reads from contig A align to contig B, then Vamb will bin A and B together. So your aligner should map reads with the same level of discrimination that you want Vamb to use. Although you can use any aligner that produces a specification-compliant BAM file, we prefer using `minimap2` (though be aware of [this annoying bug in minimap2](https://github.com/lh3/minimap2/issues/15) - see our Snakemake script for how to work around it): +__4) Map the reads to the FASTA file to obtain abundances__ +Be careful to choose proper parameters for your aligner - in general, if reads from contig A align to contig B, then Vamb will bin A and B together. +So your aligner should map reads with the same level of discrimination that you want Vamb to use. 
+You can use any mapper that produces a specification-compliant BAM file, +but we prefer using `strobealign` with the `--aemb` flag, which is extremely fast and produces abundance estimates directly: ``` -minimap2 -d catalogue.mmi /path/to/catalogue.fna.gz; # make index -minimap2 -t 28 -N 5 -ax sr catalogue.mmi --split-prefix mmsplit sample1.forward.fastq.gz sample1.reverse.fastq.gz | samtools view -F 3584 -b --threads 8 > sample1.bam +strobealign -t 8 --aemb /path/to/catalogue.fna.gz /path/to/reads/sample1.fw.fq.gz /path/to/reads/sample1.rv.fq.gz > aembdir/sample1.aemb.tsv ``` -:warning: *Important:* Do *not* filter the aligments for mapping quality as specified by the MAPQ field of the BAM file. This field gives the probability that the mapping position is correct, which is influenced by the number of alternative mapping locations. Filtering low MAPQ alignments away removes alignments to homologous sequences which biases the depth estimation. -If you are using BAM files where you do not trust the validity of every alignment in the file, you can filter the alignments for minimum nucleotide identity using the `-z` option. +:warning: *Important:* If you use BAM files as input instead of the output of `strobealign --aemb`, Vamb needs BAM files sorted by coordinate. You can sort BAM files with `samtools sort`. __5) Run Vamb__ By default, Vamb does not output any FASTA files of the bins. In the examples below, the option `--minfasta 200000` is set, meaning that all bins with a size of 200 kbp or more will be output as FASTA files. Run Vamb with: -`vamb bin default -o SEP --outdir OUT --fasta FASTA --bamdir /dir/with/bamfiles --minfasta 200000`, +`vamb bin default -o SEP --outdir OUT --fasta FASTA --aemb AEMBDIR --minfasta 200000`, -where `SEP` in the {Separator} chosen in step 3, e.g. `C` in that example, `OUT` is the name of the output directory to create, `FASTA` the path to the FASTA file and `BAM1` the path to the first BAM file. You can also use shell globbing to input multiple BAM files: `my_bamdir/*bam`. +where `SEP` is the {Separator} chosen in step 3, e.g. `C` in that example (it defaults to 'C'), `OUT` is the name of the output directory to create, `FASTA` the path to the FASTA file and `AEMBDIR` the path to a directory with the .tsv files created by `strobealign` in step 4. +If you input BAM files, e.g. from a mapper other than `strobealign`, use `--bamdir` instead of `--aemb` to specify the input to Vamb. Note that if you provide insufficient memory (RAM) to Vamb, it will run very slowly due to [thrashing](https://en.wikipedia.org/wiki/Thrashing_(computer_science)). Make sure you don't run out of RAM!
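Putting the mapping and binning steps of the updated README together, a minimal sketch of the workflow might look like the following; the sample names, thread count, and output directory are placeholders, not part of this changeset:
```
mkdir -p aembdir
for sample in sample1 sample2; do
    strobealign -t 8 --aemb /path/to/catalogue.fna.gz \
        /path/to/reads/${sample}.fw.fq.gz /path/to/reads/${sample}.rv.fq.gz \
        > aembdir/${sample}.aemb.tsv   # file names must end in .tsv
done
vamb bin default --outdir vambout --fasta /path/to/catalogue.fna.gz --aemb aembdir --minfasta 200000
```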
diff --git a/test/data/aemb/6.aemb.tsv b/test/data/aemb/6.aemb.tsv new file mode 100644 index 00000000..7a5fbc38 --- /dev/null +++ b/test/data/aemb/6.aemb.tsv @@ -0,0 +1,12 @@ +S27C95602 5.988746 +S27C25358 25.066412 +S27C181335 1.159981 +S4C222286 33.167842 +S11C13125 6.825609 +S4C480978 6.578677 +S12C228927 6.716019 +S27C93037 13.361650 +S9C124493 16.475576 +S27C214882 6.249275 +S7C273086 3.115955 +S12C85159 3.793851 diff --git a/test/data/aemb/7.aemb.tsv b/test/data/aemb/7.aemb.tsv new file mode 100644 index 00000000..4cfce44c --- /dev/null +++ b/test/data/aemb/7.aemb.tsv @@ -0,0 +1,12 @@ +S27C95602 0.000000 +S27C25358 40.494834 +S27C181335 1.123731 +S4C222286 41.094294 +S11C13125 0.000000 +S4C480978 0.000617 +S12C228927 0.000000 +S27C93037 0.084345 +S9C124493 0.000000 +S27C214882 1.003976 +S7C273086 2.705933 +S12C85159 4.306267 diff --git a/test/data/aemb/8.aemb.tsv b/test/data/aemb/8.aemb.tsv new file mode 100644 index 00000000..78c2f5e2 --- /dev/null +++ b/test/data/aemb/8.aemb.tsv @@ -0,0 +1,12 @@ +S27C95602 0.000000 +S27C25358 2.157007 +S27C181335 5.691155 +S4C222286 35.064668 +S11C13125 0.000000 +S4C480978 0.000000 +S12C228927 0.000000 +S27C93037 0.099672 +S9C124493 0.578131 +S27C214882 0.000000 +S7C273086 1.028035 +S12C85159 2.950335 diff --git a/test/test_parsebam.py b/test/test_parsebam.py index 0fd0c647..7dfe619c 100644 --- a/test/test_parsebam.py +++ b/test/test_parsebam.py @@ -1,6 +1,8 @@ import unittest import io import numpy as np +import random +import tempfile import vamb import testtools @@ -102,3 +104,60 @@ def test_save_load(self): self.assertTrue(np.all(abundance2.samplenames == self.abundance.samplenames)) self.assertEqual(abundance2.refhash, self.abundance.refhash) self.assertEqual(abundance2.minid, self.abundance.minid) + + def test_parse_from_aemb(self): + abundance = vamb.parsebam.Abundance.from_aemb( + testtools.AEMB_FILES, self.comp_metadata + ) + + self.assertTrue(abundance.refhash == self.comp_metadata.refhash) + self.assertTrue( + (abundance.samplenames == [str(i) for i in testtools.AEMB_FILES]).all() + ) + + manual_matrix = [] + for file in testtools.AEMB_FILES: + manual_matrix.append([]) + with open(file) as f: + for line in f: + (_, ab) = line.split() + manual_matrix[-1].append(float(ab)) + manual_matrix = list(map(list, zip(*manual_matrix))) + + self.assertTrue((np.abs(manual_matrix - abundance.matrix) < 1e-5).all()) + + # Good headers, but in other order + names = list(self.comp_metadata.identifiers) + random.shuffle(names) + files = [self.make_aemb_file(names) for i in range(2)] + a = vamb.parsebam.Abundance.from_aemb( + [f.name for f in files], self.comp_metadata + ) + self.assertIsInstance(a, vamb.parsebam.Abundance) + + def make_aemb_file(self, headers): + file = tempfile.NamedTemporaryFile(mode="w+") + for header in headers: + n = random.random() * 10 + print(header, "\t", str(n), file=file) + file.seek(0) + return file + + def test_bad_aemb(self): + # One header is wrong + names = list(self.comp_metadata.identifiers) + names[-4] = names[-4] + "a" + files = [self.make_aemb_file(names) for i in range(2)] + with self.assertRaises(ValueError): + vamb.parsebam.Abundance.from_aemb( + [f.name for f in files], self.comp_metadata + ) + + # Too many headers + names = list(self.comp_metadata.identifiers) + names.append(names[0]) + files = [self.make_aemb_file(names) for i in range(2)] + with self.assertRaises(ValueError): + vamb.parsebam.Abundance.from_aemb( + [f.name for f in files], self.comp_metadata + ) diff --git a/test/testtools.py b/test/testtools.py 
index 7fa2ef43..1334f56e 100644 --- a/test/testtools.py +++ b/test/testtools.py @@ -12,6 +12,8 @@ for i in os.listdir(os.path.join(DATADIR, "bam")) ] ) +AEMB_DIR = os.path.join(DATADIR, "aemb") +AEMB_FILES = sorted([pathlib.Path(AEMB_DIR).joinpath(i) for i in os.listdir(AEMB_DIR)]) BAM_NAMES = [ "S27C175628", diff --git a/vamb/__main__.py b/vamb/__main__.py index 8933e359..afc6884f 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -108,6 +108,14 @@ class AbundancePath(type(Path())): pass +class BAMPaths(list): + pass + + +class AEMBPaths(list): + pass + + class AbundanceOptions: __slots__ = ["path", "min_alignment_id", "refcheck"] @@ -115,22 +123,27 @@ def __init__( self, bampaths: Optional[list[Path]], bamdir: Optional[Path], + aemb: Optional[Path], abundancepath: Optional[Path], min_alignment_id: Optional[float], refcheck: bool, ): assert isinstance(bampaths, (list, type(None))) assert isinstance(bamdir, (Path, type(None))) + assert isinstance(aemb, (Path, type(None))) assert isinstance(abundancepath, (Path, type(None))) assert isinstance(min_alignment_id, (float, type(None))) assert isinstance(refcheck, bool) # Make sure only one RPKM input is there if ( - (bampaths is not None) + (abundancepath is not None) + (bamdir is not None) + (bampaths is not None) + + (abundancepath is not None) + + (bamdir is not None) + + (aemb is not None) ) != 1: raise argparse.ArgumentTypeError( - "Must specify exactly one of BAM files, BAM dir or abundance NPZ file input" + "Must specify exactly one of BAM files, BAM dir, AEMB or abundance NPZ file input" ) if abundancepath is not None: @@ -161,6 +174,18 @@ def __init__( f'Not an existing non-directory file: "{str(bampath)}"' ) self.add_existing_bam_paths(bampaths) + elif aemb is not None: + if not aemb.is_dir(): + raise NotADirectoryError( + f"Directory '{aemb}' passed with --aemb is not an existing directory." + ) + paths = [p for p in aemb.iterdir() if p.is_file() and p.suffix == ".tsv"] + if len(paths) == 0: + raise ValueError( + f"No `.tsv` files found in --aemb argument {aemb}. " + "Make sure all TSV files in the directory ends with `.tsv`." + ) + self.path = AEMBPaths(paths) if min_alignment_id is not None: if bampaths is None: @@ -185,7 +210,7 @@ def add_existing_bam_paths(self, paths: list[Path]): for bampath in paths: if not pycoverm.is_bam_sorted(str(bampath)): raise ValueError(f"Path {bampath} is not sorted by reference.") - self.path = paths + self.path = BAMPaths(paths) class VAEOptions: @@ -631,13 +656,12 @@ def calc_rpkm( f"Loaded abundance has {abundance.nseqs} sequences, " f"but composition has {comp_metadata.nseqs}." 
) - else: - assert isinstance(path, list) + elif isinstance(path, BAMPaths): logger.info(f"\tParsing {len(path)} BAM files with {nthreads} threads") logger.info(f"\tMin identity: {abundance_options.min_alignment_id}") abundance = vamb.parsebam.Abundance.from_files( - path, + list(path), outdir.joinpath("tmp").joinpath("pycoverm"), comp_metadata, abundance_options.refcheck, @@ -649,6 +673,20 @@ def calc_rpkm( logger.info("\tOrder of columns is:") for i, samplename in enumerate(abundance.samplenames): logger.info(f"\t{i:>6}: {samplename}") + elif isinstance(path, AEMBPaths): + paths = list(path) + logger.info(f"\tParsing {len(paths)} AEMB TSV files") + abundance = vamb.parsebam.Abundance.from_aemb( + paths, + comp_metadata, + ) + abundance.save(outdir.joinpath("abundance.npz")) + logger.info("\tOrder of columns is:") + for i, samplename in enumerate(abundance.samplenames): + logger.info(f"\t{i:>6}: {samplename}") + + else: + assert False elapsed = round(time.time() - begintime, 2) logger.info(f"\tProcessed RPKM in {elapsed} seconds.\n") @@ -1414,6 +1452,7 @@ def __init__(self, args): self.abundance_options = AbundanceOptions( self.args.bampaths, self.args.bamdir, + self.args.aemb, self.args.abundancepath, self.args.min_alignment_id, not self.args.norefcheck, @@ -1680,6 +1719,12 @@ def add_input_output_arguments(subparser): type=Path, help="Dir with .bam files to use", ) + rpkmos.add_argument( + "--aemb", + metavar="", + type=Path, + help="Dir with .tsv files from strobealign --aemb", + ) rpkmos.add_argument( "--rpkm", metavar="", diff --git a/vamb/aamb_encode.py b/vamb/aamb_encode.py index 19f4ce85..6ac7521f 100644 --- a/vamb/aamb_encode.py +++ b/vamb/aamb_encode.py @@ -261,7 +261,8 @@ def trainmodel( for epoch_i in range(nepochs): if epoch_i in batchsteps: data_loader = set_batchsize( - data_loader, data_loader.batch_size * 2 # type:ignore + data_loader, + data_loader.batch_size * 2, # type:ignore ) ( diff --git a/vamb/encode.py b/vamb/encode.py index d6afbb96..b52cbcd3 100644 --- a/vamb/encode.py +++ b/vamb/encode.py @@ -378,7 +378,8 @@ def trainepoch( if epoch in batchsteps: data_loader = set_batchsize( - data_loader, data_loader.batch_size * 2 # type:ignore + data_loader, + data_loader.batch_size * 2, # type:ignore ) for depths_in, tnf_in, abundance_in, weights in data_loader: diff --git a/vamb/parsebam.py b/vamb/parsebam.py index afa113f2..b875691b 100644 --- a/vamb/parsebam.py +++ b/vamb/parsebam.py @@ -232,3 +232,57 @@ def run_pycoverm( ) return (coverage, refhash) + + @classmethod + def from_aemb( + cls: type[A], + paths: list[Path], + comp_metadata: CompositionMetaData, + ) -> A: + # We cannot guarantee strobealign outputs the contigs in the right order, + # so we must reorder them. 
+ identifier_to_index: dict[str, int] = { + s: i for (i, s) in enumerate(comp_metadata.identifiers) + } + + result = _np.empty((len(identifier_to_index), len(paths)), dtype=_np.float32) + for col_num, path in enumerate(paths): + column = _np.full(len(identifier_to_index), float("nan"), dtype=_np.float32) + with vambtools.Reader(path) as file: + n_lines_seen = 0 + for line in file: + line = line.rstrip() + if len(line) == 0: + continue + n_lines_seen += 1 + if n_lines_seen > len(column): + raise ValueError( + f'Too many rows in AEMB file "{path}", expected {len(column)}' + ) + # strobealign reserves the right to add more columns in future versions + (identifier, depth_str, *_) = line.split(maxsplit=2) + index = identifier_to_index.get(identifier.decode()) + if index is None: + raise ValueError( + f'Identifier "{identifier}" seen in `strobealign --aemb` output file "{path}", ' + "but this identifier is not seen in the FASTA file.\n" + "Verify that the AEMB file was generated by mapping to the same FASTA file " + "as was input to Vamb." + ) + column[index] = float(depth_str) + + isnans = _np.isnan(column) + if isnans.any(): + first_missing_index = _np.nonzero(isnans)[0][0] + missing_identifier = comp_metadata.identifiers[first_missing_index] + raise ValueError( + f'Identifier "{missing_identifier}" from the input FASTA file is missing from the AEMB file "{path}".' + ) + result[:, col_num] = column + + return cls( + result, + [str(i) for i in paths], + 0.0, + comp_metadata.refhash, + )
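For the new `Abundance.from_aemb` classmethod added in `vamb/parsebam.py`, a rough usage sketch follows; the call to `vamb.parsecontigs.Composition.from_file` and the file paths are assumptions for illustration, not part of this diff:
```python
from pathlib import Path
import vamb

# Assumption: the catalogue FASTA is parsed into a Composition object, and its
# .metadata attribute is the CompositionMetaData that from_aemb validates against.
with vamb.vambtools.Reader("catalogue.fna.gz") as filehandle:
    composition = vamb.parsecontigs.Composition.from_file(filehandle)

# One `strobealign --aemb` TSV per sample, e.g. produced as in the README above.
aemb_files = sorted(Path("aembdir").glob("*.tsv"))

# Rows follow the FASTA identifiers, columns follow the order of aemb_files;
# any identifier mismatch between a TSV and the FASTA raises ValueError.
abundance = vamb.parsebam.Abundance.from_aemb(aemb_files, composition.metadata)
print(abundance.matrix.shape)  # (n_contigs, n_samples)
```
This mirrors how `calc_rpkm` in `vamb/__main__.py` consumes `AEMBPaths`, except that the command-line entry point also saves the result to `abundance.npz`.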