diff --git a/vamb/__main__.py b/vamb/__main__.py index 6441b3df..4617ce42 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -919,14 +919,6 @@ def run( comp_metadata = composition.metadata del composition, abundance - # Write contignames and contiglengths needed for dereplication purposes - np.savetxt( - vamb_options.out_dir.joinpath("contignames"), - comp_metadata.identifiers, - fmt="%s", - ) - np.savez(vamb_options.out_dir.joinpath("lengths.npz"), comp_metadata.lengths) - if vae_options is not None: assert latent is not None assert comp_metadata.nseqs == len(latent) diff --git a/vamb/parsecontigs.py b/vamb/parsecontigs.py index 5d127c81..7070b51b 100644 --- a/vamb/parsecontigs.py +++ b/vamb/parsecontigs.py @@ -1,10 +1,3 @@ -__doc__ = """Calculate tetranucleotide frequency from a FASTA file. - -Usage: ->>> with open('/path/to/contigs.fna') as filehandle -... tnfs, contignames, lengths = read_contigs(filehandle) -""" - import os as _os import numpy as _np import vamb.vambtools as _vambtools @@ -121,7 +114,7 @@ def save(self, io: Union[str, Path, IO[bytes]]): ) @classmethod - def load(cls, io: Union[str, IO[bytes]]): + def load(cls, io: Union[str, IO[bytes], Path]): arrs = _np.load(io, allow_pickle=True) metadata = CompositionMetaData( _vambtools.validate_input_array(arrs["identifiers"]), diff --git a/workflow_avamb/README.md b/workflow_avamb/README.md index bfc90de3..a8e69add 100644 --- a/workflow_avamb/README.md +++ b/workflow_avamb/README.md @@ -133,8 +133,6 @@ Avamb produces the following output files: - `aae_z_clusters.tsv`: file generated by clustering the AAE z latent space, where each row is a sequence: Left column for the cluster (i.e bin) name, right column for the sequence name. You can create the FASTA-file bins themselves using the script in `src/create_fasta.py` - `aae_z_latent.npz`: this contains the output of the AAE model z latent space. - `composition.npz`: a Numpy .npz file that contain all kmer composition information computed by Avamb from the FASTA file. This can be provided to another run of Avamb to skip the composition calculation step. - - `contignames`: text file containing a list of the contigs remaining after the minimum contig size allowed, and defined on the `min_contig_size` in the `config.json` file. - - `lengths.npz`: Numpy object that contains the contig length, same order than the contignames. - `log.txt`: a text file with information about the Avamb run. Look here (and at stderr) if you experience errors. - `model.pt`: a file containing the trained VAE model. When running Avamb from a Python interpreter, the VAE can be loaded from this file to skip training. - `aae_model.pt`: a file containing the trained AAE model. When running Avamb from a Python interpreter, the AAE can be loaded from this file to skip training. diff --git a/workflow_avamb/avamb.snake.conda.smk b/workflow_avamb/avamb.snake.conda.smk index c80cb046..0048260a 100644 --- a/workflow_avamb/avamb.snake.conda.smk +++ b/workflow_avamb/avamb.snake.conda.smk @@ -335,8 +335,6 @@ rule run_avamb: clusters_aae_z=os.path.join(OUTDIR,"avamb/aae_z_clusters_split.tsv"), clusters_aae_y=os.path.join(OUTDIR,"avamb/aae_y_clusters_split.tsv"), clusters_vamb=os.path.join(OUTDIR,"avamb/vae_clusters_split.tsv"), - contignames=os.path.join(OUTDIR,"avamb/contignames"), - contiglenghts=os.path.join(OUTDIR,"avamb/lengths.npz") params: walltime="86400", nodes="1", @@ -471,8 +469,7 @@ rule create_cluster_scores_bin_path_dictionaries: rule run_drep_manual_vamb_z_y: input: cluster_score_dict_path_avamb=os.path.join(OUTDIR,"tmp/cs_d_avamb.json"), - contignames=os.path.join(OUTDIR,"avamb/contignames"), - contiglengths=os.path.join(OUTDIR,"avamb/lengths.npz"), + composition=os.path.join(OUTDIR,"avamb/composition.npz"), clusters_aae_z=os.path.join(OUTDIR,"avamb/aae_z_clusters_split.tsv"), clusters_aae_y=os.path.join(OUTDIR,"avamb/aae_y_clusters_split.tsv"), clusters_vamb=os.path.join(OUTDIR,"avamb/vae_clusters_split.tsv") @@ -496,8 +493,8 @@ rule run_drep_manual_vamb_z_y: shell: """ - python {params.path} --cs_d {input.cluster_score_dict_path_avamb} --names {input.contignames}\ - --lengths {input.contiglengths} --output {output.clusters_avamb_manual_drep}\ + python {params.path} --cs_d {input.cluster_score_dict_path_avamb} --composition {input.composition}\ + --output {output.clusters_avamb_manual_drep}\ --clusters {input.clusters_aae_z} {input.clusters_aae_y} {input.clusters_vamb}\ --comp {MIN_COMP} --cont {MAX_CONT} --min_bin_size {MIN_BIN_SIZE} """ @@ -533,8 +530,8 @@ checkpoint create_ripped_bins_avamb: shell: """ python {params.path} -r {OUTDIR}/avamb/ --ci {input.path_avamb_manually_drep_clusters}\ - --co {output.path_avamb_manually_drep_clusters_ripped} -l {OUTDIR}/avamb/lengths.npz\ - -n {OUTDIR}/avamb/contignames --bp_d {input.bin_path_dict_path} --br {OUTDIR}/tmp/ripped_bins\ + --co {output.path_avamb_manually_drep_clusters_ripped} -c {OUTDIR}/avamb/composition.npz\ + --bp_d {input.bin_path_dict_path} --br {OUTDIR}/tmp/ripped_bins\ --bin_separator C --log_nc_ripped_bins {output.name_bins_ripped_file} """ # If after run_drep_manual_vamb_z_y rule (dereplication rule), there are no contigs present in more than one bin, diff --git a/workflow_avamb/src/manual_drep_JN.py b/workflow_avamb/src/manual_drep_JN.py index 86df847d..f2730f67 100644 --- a/workflow_avamb/src/manual_drep_JN.py +++ b/workflow_avamb/src/manual_drep_JN.py @@ -17,10 +17,8 @@ def main( # Path to output clusters file. Will error if it already exists outpath: Path, - # Path to names file of contig names. CHANGED - names: Path, - # Path to length.npz of contig length. Output by Vamb - lengths_npz: Path, + # Path to composition.npz + composition_path: Path, # Path to CheckM2 quality_report.tsv file quality_report: dict[str, list], # List of paths to clusters.tsv files as output by Vamb. @@ -36,15 +34,16 @@ def main( bins_extension: str, min_bin_size: int, ) -> None: - # Load contig names - contig_names: list[str] = list(np.loadtxt(names, dtype=object)) + # Load contig names and lengths + comp = vamb.parsecontigs.Composition.load(composition_path) + contig_names: list[str] = list(comp.metadata.identifiers) assert isinstance(contig_names, list) assert isinstance(contig_names[0], str) - # Load lengths - lengths: np.ndarray = vamb.vambtools.read_npz(lengths_npz) + lengths = comp.metadata.lengths assert len(lengths) == len(contig_names) + del comp # free up memory # Load CheckM2 (bin_names, qualities, bin_by_name) = load_checkm2( @@ -276,8 +275,9 @@ def compute_to_remove( if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--cs_d", type=str, help="path bins_scores dictionary") - parser.add_argument("--names", type=str, help="Contig names txt file path ") - parser.add_argument("--lengths", type=str, help="Contig lengths npz file path ") + parser.add_argument( + "--composition", type=Path, help="Path to the composition.npz file" + ) parser.add_argument( "--output", type=str, @@ -308,8 +308,7 @@ def compute_to_remove( main( outpath=opt.output, - names=opt.names, - lengths_npz=opt.lengths, + composition_path=opt.composition, quality_report=cluster_scores, binnings=opt.clusters, min_cov=opt.cov, diff --git a/workflow_avamb/src/rip_bins.py b/workflow_avamb/src/rip_bins.py index e4fa705b..89f4aaad 100644 --- a/workflow_avamb/src/rip_bins.py +++ b/workflow_avamb/src/rip_bins.py @@ -1,6 +1,5 @@ from Bio import SeqIO import os -import numpy as np from collections import OrderedDict import argparse import networkx as nx @@ -432,8 +431,7 @@ def find_remaining_clusters_ripped_and_write_ripped_bins( parser.add_argument( "--co", type=str, help="path dereplicated clusters without ripped clusters" ) - parser.add_argument("-l", type=str, help="path to contig lengths") - parser.add_argument("-n", type=str, help="path to contig names") + parser.add_argument("-c", type=str, help="path to composition") parser.add_argument("--bp_d", type=str, help="bin_path dictionary path") parser.add_argument("--br", type=str, help="path ripped bins") parser.add_argument("--bin_separator", type=str, help="path ripped bins") @@ -459,11 +457,10 @@ def find_remaining_clusters_ripped_and_write_ripped_bins( with open(clusters_path) as file: cluster_contigs = vamb.vambtools.read_clusters(file) - contig_lengths_file = opt.l - contig_lengths = cast(Sequence[int], np.load(contig_lengths_file)["arr_0"]) - - contignames_file = opt.n - contig_names = cast(Sequence[str], np.loadtxt(contignames_file, dtype=object)) + comp = vamb.parsecontigs.Composition.load(opt.c) + contig_lengths = cast(Sequence[int], comp.metadata.lengths) + contig_names = cast(Sequence[str], comp.metadata.identifiers) + del comp # free memory path_ripped = cast(str, opt.br)