Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not make files lengths.npz and contignames #307

Merged
merged 1 commit into from
Jul 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions vamb/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,14 +919,6 @@ def run(
comp_metadata = composition.metadata
del composition, abundance

# Write contignames and contiglengths needed for dereplication purposes
np.savetxt(
vamb_options.out_dir.joinpath("contignames"),
comp_metadata.identifiers,
fmt="%s",
)
np.savez(vamb_options.out_dir.joinpath("lengths.npz"), comp_metadata.lengths)

if vae_options is not None:
assert latent is not None
assert comp_metadata.nseqs == len(latent)
Expand Down
9 changes: 1 addition & 8 deletions vamb/parsecontigs.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,3 @@
__doc__ = """Calculate tetranucleotide frequency from a FASTA file.

Usage:
>>> with open('/path/to/contigs.fna') as filehandle
... tnfs, contignames, lengths = read_contigs(filehandle)
"""

import os as _os
import numpy as _np
import vamb.vambtools as _vambtools
Expand Down Expand Up @@ -121,7 +114,7 @@ def save(self, io: Union[str, Path, IO[bytes]]):
)

@classmethod
def load(cls, io: Union[str, IO[bytes]]):
def load(cls, io: Union[str, IO[bytes], Path]):
arrs = _np.load(io, allow_pickle=True)
metadata = CompositionMetaData(
_vambtools.validate_input_array(arrs["identifiers"]),
Expand Down
2 changes: 0 additions & 2 deletions workflow_avamb/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,6 @@ Avamb produces the following output files:
- `aae_z_clusters.tsv`: file generated by clustering the AAE z latent space, where each row is a sequence: Left column for the cluster (i.e. bin) name, right column for the sequence name. You can create the FASTA-file bins themselves using the script in `src/create_fasta.py`
- `aae_z_latent.npz`: this contains the output of the AAE model z latent space.
- `composition.npz`: a Numpy .npz file that contains all kmer composition information computed by Avamb from the FASTA file. This can be provided to another run of Avamb to skip the composition calculation step.
- `contignames`: text file containing a list of the contigs remaining after the minimum contig size allowed, and defined on the `min_contig_size` in the `config.json` file.
- `lengths.npz`: Numpy object that contains the contig lengths, in the same order as the contignames.
- `log.txt`: a text file with information about the Avamb run. Look here (and at stderr) if you experience errors.
- `model.pt`: a file containing the trained VAE model. When running Avamb from a Python interpreter, the VAE can be loaded from this file to skip training.
- `aae_model.pt`: a file containing the trained AAE model. When running Avamb from a Python interpreter, the AAE can be loaded from this file to skip training.
Expand Down
13 changes: 5 additions & 8 deletions workflow_avamb/avamb.snake.conda.smk
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,6 @@ rule run_avamb:
clusters_aae_z=os.path.join(OUTDIR,"avamb/aae_z_clusters_split.tsv"),
clusters_aae_y=os.path.join(OUTDIR,"avamb/aae_y_clusters_split.tsv"),
clusters_vamb=os.path.join(OUTDIR,"avamb/vae_clusters_split.tsv"),
contignames=os.path.join(OUTDIR,"avamb/contignames"),
contiglenghts=os.path.join(OUTDIR,"avamb/lengths.npz")
params:
walltime="86400",
nodes="1",
Expand Down Expand Up @@ -471,8 +469,7 @@ rule create_cluster_scores_bin_path_dictionaries:
rule run_drep_manual_vamb_z_y:
input:
cluster_score_dict_path_avamb=os.path.join(OUTDIR,"tmp/cs_d_avamb.json"),
contignames=os.path.join(OUTDIR,"avamb/contignames"),
contiglengths=os.path.join(OUTDIR,"avamb/lengths.npz"),
composition=os.path.join(OUTDIR,"avamb/composition.npz"),
clusters_aae_z=os.path.join(OUTDIR,"avamb/aae_z_clusters_split.tsv"),
clusters_aae_y=os.path.join(OUTDIR,"avamb/aae_y_clusters_split.tsv"),
clusters_vamb=os.path.join(OUTDIR,"avamb/vae_clusters_split.tsv")
Expand All @@ -496,8 +493,8 @@ rule run_drep_manual_vamb_z_y:

shell:
"""
python {params.path} --cs_d {input.cluster_score_dict_path_avamb} --names {input.contignames}\
--lengths {input.contiglengths} --output {output.clusters_avamb_manual_drep}\
python {params.path} --cs_d {input.cluster_score_dict_path_avamb} --composition {input.composition}\
--output {output.clusters_avamb_manual_drep}\
--clusters {input.clusters_aae_z} {input.clusters_aae_y} {input.clusters_vamb}\
--comp {MIN_COMP} --cont {MAX_CONT} --min_bin_size {MIN_BIN_SIZE}
"""
Expand Down Expand Up @@ -533,8 +530,8 @@ checkpoint create_ripped_bins_avamb:
shell:
"""
python {params.path} -r {OUTDIR}/avamb/ --ci {input.path_avamb_manually_drep_clusters}\
--co {output.path_avamb_manually_drep_clusters_ripped} -l {OUTDIR}/avamb/lengths.npz\
-n {OUTDIR}/avamb/contignames --bp_d {input.bin_path_dict_path} --br {OUTDIR}/tmp/ripped_bins\
--co {output.path_avamb_manually_drep_clusters_ripped} -c {OUTDIR}/avamb/composition.npz\
--bp_d {input.bin_path_dict_path} --br {OUTDIR}/tmp/ripped_bins\
--bin_separator C --log_nc_ripped_bins {output.name_bins_ripped_file}
"""
# If after run_drep_manual_vamb_z_y rule (dereplication rule), there are no contigs present in more than one bin,
Expand Down
23 changes: 11 additions & 12 deletions workflow_avamb/src/manual_drep_JN.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,8 @@
def main(
# Path to output clusters file. Will error if it already exists
outpath: Path,
# Path to names file of contig names. CHANGED
names: Path,
# Path to length.npz of contig length. Output by Vamb
lengths_npz: Path,
# Path to composition.npz
composition_path: Path,
# Path to CheckM2 quality_report.tsv file
quality_report: dict[str, list],
# List of paths to clusters.tsv files as output by Vamb.
Expand All @@ -36,15 +34,16 @@ def main(
bins_extension: str,
min_bin_size: int,
) -> None:
# Load contig names
contig_names: list[str] = list(np.loadtxt(names, dtype=object))
# Load contig names and lengths
comp = vamb.parsecontigs.Composition.load(composition_path)

contig_names: list[str] = list(comp.metadata.identifiers)
assert isinstance(contig_names, list)
assert isinstance(contig_names[0], str)

# Load lengths
lengths: np.ndarray = vamb.vambtools.read_npz(lengths_npz)
lengths = comp.metadata.lengths
assert len(lengths) == len(contig_names)
del comp # free up memory

# Load CheckM2
(bin_names, qualities, bin_by_name) = load_checkm2(
Expand Down Expand Up @@ -276,8 +275,9 @@ def compute_to_remove(
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--cs_d", type=str, help="path bins_scores dictionary")
parser.add_argument("--names", type=str, help="Contig names txt file path ")
parser.add_argument("--lengths", type=str, help="Contig lengths npz file path ")
parser.add_argument(
"--composition", type=Path, help="Path to the composition.npz file"
)
parser.add_argument(
"--output",
type=str,
Expand Down Expand Up @@ -308,8 +308,7 @@ def compute_to_remove(

main(
outpath=opt.output,
names=opt.names,
lengths_npz=opt.lengths,
composition_path=opt.composition,
quality_report=cluster_scores,
binnings=opt.clusters,
min_cov=opt.cov,
Expand Down
13 changes: 5 additions & 8 deletions workflow_avamb/src/rip_bins.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from Bio import SeqIO
import os
import numpy as np
from collections import OrderedDict
import argparse
import networkx as nx
Expand Down Expand Up @@ -432,8 +431,7 @@ def find_remaining_clusters_ripped_and_write_ripped_bins(
parser.add_argument(
"--co", type=str, help="path dereplicated clusters without ripped clusters"
)
parser.add_argument("-l", type=str, help="path to contig lengths")
parser.add_argument("-n", type=str, help="path to contig names")
parser.add_argument("-c", type=str, help="path to composition")
parser.add_argument("--bp_d", type=str, help="bin_path dictionary path")
parser.add_argument("--br", type=str, help="path ripped bins")
parser.add_argument("--bin_separator", type=str, help="path ripped bins")
Expand All @@ -459,11 +457,10 @@ def find_remaining_clusters_ripped_and_write_ripped_bins(
with open(clusters_path) as file:
cluster_contigs = vamb.vambtools.read_clusters(file)

contig_lengths_file = opt.l
contig_lengths = cast(Sequence[int], np.load(contig_lengths_file)["arr_0"])

contignames_file = opt.n
contig_names = cast(Sequence[str], np.loadtxt(contignames_file, dtype=object))
comp = vamb.parsecontigs.Composition.load(opt.c)
contig_lengths = cast(Sequence[int], comp.metadata.lengths)
contig_names = cast(Sequence[str], comp.metadata.identifiers)
del comp # free memory

path_ripped = cast(str, opt.br)

Expand Down
Loading