More consistently rename RPKM to abundance

* RPKM is more obscure, technical terminology * It's technically wrong, since we don't require the abundance to be expressed in terms of reads per kilobase per million mapped reads. * We call the output file `abundance.npz` * "Abundance" is the standard terminology for this feature in the literature
RasmussenLab · Jul 18, 2024 · 7bdc15c · 7bdc15c
1 parent c049fbd
commit 7bdc15c
Show file tree

Hide file tree

Showing 5 changed files with 56 additions and 57 deletions.
diff --git a/.github/workflows/cli_vamb.yml b/.github/workflows/cli_vamb.yml
@@ -37,30 +37,30 @@ jobs:
         pip install -e .
     - name: Run VAMB
       run: |
-        vamb bin default --outdir outdir_vamb --fasta catalogue_mock.fna.gz --rpkm abundance_mock.npz -l 32 -e 10 -q 2 -o C --minfasta 200000 -t 10
+        vamb bin default --outdir outdir_vamb --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz -l 32 -e 10 -q 2 -o C --minfasta 200000 -t 10
         ls -la outdir_vamb
         cat outdir_vamb/log.txt
     - name: Run TaxVAMB
       run: |
-        vamb bin taxvamb --outdir outdir_taxvamb --fasta catalogue_mock.fna.gz --rpkm abundance_mock.npz --taxonomy taxonomy_mock.tsv -pe 10 -pt 10 -e 10 -q -t 10 -o C --minfasta 200000
+        vamb bin taxvamb --outdir outdir_taxvamb --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --taxonomy taxonomy_mock.tsv -pe 10 -pt 10 -e 10 -q -t 10 -o C --minfasta 200000
         ls -la outdir_taxvamb
         cat outdir_taxvamb/log.txt
-        vamb bin taxvamb --outdir outdir_taxvamb_no_predict --no_predictor --fasta catalogue_mock.fna.gz --rpkm abundance_mock.npz --taxonomy taxonomy_mock.tsv -e 10 -q -t 10 -o C --minfasta 200000
+        vamb bin taxvamb --outdir outdir_taxvamb_no_predict --no_predictor --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --taxonomy taxonomy_mock.tsv -e 10 -q -t 10 -o C --minfasta 200000
         ls -la outdir_taxvamb_no_predict
         cat outdir_taxvamb_no_predict/log.txt
-        vamb bin taxvamb --outdir outdir_taxvamb_preds --fasta catalogue_mock.fna.gz --rpkm abundance_mock.npz  --no_predictor --taxonomy outdir_taxvamb/results_taxometer.tsv -e 10 -q -t 10 -o C --minfasta 200000
+        vamb bin taxvamb --outdir outdir_taxvamb_preds --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz  --no_predictor --taxonomy outdir_taxvamb/results_taxometer.tsv -e 10 -q -t 10 -o C --minfasta 200000
         ls -la outdir_taxvamb_preds
         cat outdir_taxvamb_preds/log.txt
     - name: Run Taxometer
       run: |
-        vamb taxometer --outdir outdir_taxometer --fasta catalogue_mock.fna.gz --rpkm abundance_mock.npz --taxonomy taxonomy_mock.tsv -pe 10 -pt 10
+        vamb taxometer --outdir outdir_taxometer --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --taxonomy taxonomy_mock.tsv -pe 10 -pt 10
         ls -la outdir_taxometer
         cat outdir_taxometer/log.txt
-        vamb taxometer --outdir outdir_taxometer_pred --fasta catalogue_mock.fna.gz --rpkm abundance_mock.npz --taxonomy outdir_taxometer/results_taxometer.tsv -pe 10 -pt 10
+        vamb taxometer --outdir outdir_taxometer_pred --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --taxonomy outdir_taxometer/results_taxometer.tsv -pe 10 -pt 10
         ls -la outdir_taxometer_pred
         cat outdir_taxometer/log.txt
     - name: Run k-means reclustering
       run: |
-        vamb recluster --outdir outdir_recluster --fasta catalogue_mock.fna.gz --rpkm abundance_mock.npz --latent_path outdir_taxvamb/vaevae_latent.npy --clusters_path outdir_taxvamb/vaevae_clusters_split.tsv --hmmout_path markers_mock.hmmout --algorithm kmeans --minfasta 200000
+        vamb recluster --outdir outdir_recluster --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --latent_path outdir_taxvamb/vaevae_latent.npy --clusters_path outdir_taxvamb/vaevae_clusters_split.tsv --hmmout_path markers_mock.hmmout --algorithm kmeans --minfasta 200000
         ls -la outdir_recluster
         cat outdir_recluster/log.txt
diff --git a/doc/how_to_run.md b/doc/how_to_run.md
@@ -75,7 +75,7 @@ or
 When parsed, Vamb will produce the file `abundance.npz`, which can be used for future
 Vamb runs instead:
 ```
---rpkm abundance.npz
+--abundance abundance.npz
 ```
 
 __Note:__ Vamb will check that the sequence names in the TSV / BAM files correspond

diff --git a/vamb/__main__.py b/vamb/__main__.py
@@ -134,7 +134,7 @@ def __init__(
         assert isinstance(min_alignment_id, (float, type(None)))
         assert isinstance(refcheck, bool)
 
-        # Make sure only one RPKM input is there
+        # Make sure only one abundance input is there
         if (
             (bampaths is not None)
             + (abundancepath is not None)
@@ -189,7 +189,7 @@ def __init__(
         if min_alignment_id is not None:
             if bampaths is None:
                 raise argparse.ArgumentTypeError(
-                    "If minid is set, RPKM must be passed as bam files"
+                    "If minid is set, abundances must be passed as bam files"
                 )
             if (
                 not isfinite(min_alignment_id)
@@ -609,7 +609,7 @@ def calc_tnf(
     return composition
 
 
-def calc_rpkm(
+def calc_abundance(
     abundance_options: AbundanceOptions,
     outdir: Path,
     comp_metadata: vamb.parsecontigs.CompositionMetaData,
@@ -670,7 +670,7 @@ def calc_rpkm(
         assert False
 
     elapsed = round(time.time() - begintime, 2)
-    logger.info(f"\tProcessed RPKM in {elapsed} seconds.\n")
+    logger.info(f"\tProcessed abundance in {elapsed} seconds.\n")
 
     return abundance
 
@@ -903,7 +903,7 @@ def load_composition_and_abundance(
     binsplitter: vamb.vambtools.BinSplitter,
 ) -> Tuple[vamb.parsecontigs.Composition, vamb.parsebam.Abundance]:
     composition = calc_tnf(comp_options, vamb_options.out_dir, binsplitter)
-    abundance = calc_rpkm(
+    abundance = calc_abundance(
         abundance_options,
         vamb_options.out_dir,
         composition.metadata,
@@ -1015,7 +1015,7 @@ def run(
 
 
 def predict_taxonomy(
-    rpkms: np.ndarray,
+    abundance: np.ndarray,
     tnfs: np.ndarray,
     lengths: np.ndarray,
     contignames: np.ndarray,
@@ -1042,7 +1042,7 @@ def predict_taxonomy(
     targets = np.array([ind_nodes[i] for i in classes_order])
 
     model = vamb.taxvamb_encode.VAMB2Label(
-        rpkms.shape[1],
+        abundance.shape[1],
         len(nodes),
         nodes,
         table_parent,
@@ -1052,13 +1052,13 @@ def predict_taxonomy(
     )
 
     dataloader_vamb = vamb.encode.make_dataloader(
-        rpkms,
+        abundance,
         tnfs,
         lengths,
         batchsize=predictor_training_options.batchsize,
     )
     dataloader_joint = vamb.taxvamb_encode.make_dataloader_concat_hloss(
-        rpkms,
+        abundance,
         tnfs,
         lengths,
         targets,
@@ -1137,14 +1137,14 @@ def run_taxonomy_predictor(
         abundance_options=abundance_options,
         binsplitter=vamb.vambtools.BinSplitter.inert_splitter(),
     )
-    rpkms, tnfs, lengths, contignames = (
+    abundance, tnfs, lengths, contignames = (
         abundance.matrix,
         composition.matrix,
         composition.metadata.lengths,
         composition.metadata.identifiers,
     )
     predict_taxonomy(
-        rpkms=rpkms,
+        abundance=abundance,
         tnfs=tnfs,
         lengths=lengths,
         contignames=contignames,
@@ -1172,7 +1172,7 @@ def run_vaevae(
         abundance_options=abundance_options,
         binsplitter=vamb.vambtools.BinSplitter.inert_splitter(),
     )
-    rpkms, tnfs, lengths, contignames = (
+    abundance, tnfs, lengths, contignames = (
         abundance.matrix,
         composition.matrix,
         composition.metadata.lengths,
@@ -1182,7 +1182,7 @@ def run_vaevae(
     if taxonomy_options.taxonomy_path is not None and not taxonomy_options.no_predictor:
         logger.info("Predicting missing values from taxonomy")
         predict_taxonomy(
-            rpkms=rpkms,
+            abundance=abundance,
             tnfs=tnfs,
             lengths=lengths,
             contignames=contignames,
@@ -1224,7 +1224,7 @@ def run_vaevae(
 
     assert vae_options is not None
     vae = vamb.taxvamb_encode.VAEVAEHLoss(
-        rpkms.shape[1],
+        abundance.shape[1],
         len(nodes),
         nodes,
         table_parent,
@@ -1237,14 +1237,14 @@ def run_vaevae(
     )
 
     dataloader_vamb = vamb.encode.make_dataloader(
-        rpkms,
+        abundance,
         tnfs,
         lengths,
         batchsize=vae_training_options.batchsize,
         cuda=vamb_options.cuda,
     )
     dataloader_joint = vamb.taxvamb_encode.make_dataloader_concat_hloss(
-        rpkms,
+        abundance,
         tnfs,
         lengths,
         targets,
@@ -1254,7 +1254,7 @@ def run_vaevae(
         cuda=vamb_options.cuda,
     )
     dataloader_labels = vamb.taxvamb_encode.make_dataloader_labels_hloss(
-        rpkms,
+        abundance,
         tnfs,
         lengths,
         targets,
@@ -1264,7 +1264,7 @@ def run_vaevae(
         cuda=vamb_options.cuda,
     )
 
-    shapes = (rpkms.shape[1], 103, 1, len(nodes))
+    shapes = (abundance.shape[1], 103, 1, len(nodes))
     dataloader = vamb.taxvamb_encode.make_dataloader_semisupervised_hloss(
         dataloader_joint,
         dataloader_vamb,
@@ -1595,7 +1595,7 @@ def run_inner(self):
 
 
 def add_input_output_arguments(subparser):
-    """Add TNFs and RPKMs arguments to a subparser."""
+    """Add TNFs and abundance arguments to a subparser."""
     # TNF arguments
     tnfos = subparser.add_argument_group(
         title="TNF input (either fasta or all .npz files required)"
@@ -1605,38 +1605,38 @@ def add_input_output_arguments(subparser):
         "--composition", metavar="", type=Path, help="path to .npz of composition"
     )
 
-    # RPKM arguments
-    rpkmos = subparser.add_argument_group(
-        title="RPKM input (exactly one input required)"
+    # Abundance arguments
+    abundanceos = subparser.add_argument_group(
+        title="Abundance input (exactly one input required)"
     )
     # Note: This argument is deprecated, but we'll keep supporting it for now.
     # Instead, use --bamdir.
-    rpkmos.add_argument(
+    abundanceos.add_argument(
         "--bamfiles",
         dest="bampaths",
         metavar="",
         type=Path,
         help=argparse.SUPPRESS,
         nargs="+",
     )
-    rpkmos.add_argument(
+    abundanceos.add_argument(
         "--bamdir",
         metavar="",
         type=Path,
         help="Dir with .bam files to use",
     )
-    rpkmos.add_argument(
+    abundanceos.add_argument(
         "--aemb",
         metavar="",
         type=Path,
         help="Dir with .tsv files from strobealign --aemb",
     )
-    rpkmos.add_argument(
-        "--rpkm",
+    abundanceos.add_argument(
+        "--abundance",
         metavar="",
         dest="abundancepath",
         type=Path,
-        help="path to .npz of RPKM abundances",
+        help="path to .npz of abundances",
     )
 
     reqos = subparser.add_argument_group(title="Output (required)", description=None)
@@ -2048,7 +2048,6 @@ def main():
         prog="vamb",
         description=doc,
         formatter_class=argparse.RawDescriptionHelpFormatter,
-        # usage="%(prog)s outdir tnf_input rpkm_input [options]",
         add_help=False,
     )
 

diff --git a/vamb/encode.py b/vamb/encode.py
@@ -50,64 +50,64 @@ def set_batchsize(
 
 
 def make_dataloader(
-    rpkm: _np.ndarray,
+    abundance: _np.ndarray,
     tnf: _np.ndarray,
     lengths: _np.ndarray,
     batchsize: int = 256,
     destroy: bool = False,
     cuda: bool = False,
 ) -> _DataLoader:
-    """Create a DataLoader from RPKM, TNF and lengths.
+    """Create a DataLoader from abundance, TNF and lengths.
 
     The dataloader is an object feeding minibatches of contigs to the VAE.
     The data are normalized versions of the input datasets.
 
     Inputs:
-        rpkm: RPKM matrix (N_contigs x N_samples)
+        abundance: Abundance matrix (N_contigs x N_samples)
         tnf: TNF matrix (N_contigs x N_TNF)
         lengths: Numpy array of sequence length (N_contigs)
         batchsize: Starting size of minibatches for dataloader
-        destroy: Mutate rpkm and tnf array in-place instead of making a copy.
+        destroy: Mutate abundance and tnf array in-place instead of making a copy.
         cuda: Pagelock memory of dataloader (use when using GPU acceleration)
 
     Outputs:
         DataLoader: An object feeding data to the VAE
     """
 
-    if not isinstance(rpkm, _np.ndarray) or not isinstance(tnf, _np.ndarray):
-        raise ValueError("TNF and RPKM must be Numpy arrays")
+    if not isinstance(abundance, _np.ndarray) or not isinstance(tnf, _np.ndarray):
+        raise ValueError("TNF and abundance must be Numpy arrays")
 
     if batchsize < 1:
         raise ValueError(f"Batch size must be minimum 1, not {batchsize}")
 
-    if len(rpkm) != len(tnf) or len(tnf) != len(lengths):
-        raise ValueError("Lengths of RPKM, TNF and lengths arrays must be the same")
+    if len(abundance) != len(tnf) or len(tnf) != len(lengths):
+        raise ValueError("Lengths of abundance, TNF and lengths arrays must be the same")
 
-    if not (rpkm.dtype == tnf.dtype == _np.float32):
-        raise ValueError("TNF and RPKM must be Numpy arrays of dtype float32")
+    if not (abundance.dtype == tnf.dtype == _np.float32):
+        raise ValueError("TNF and abundance must be Numpy arrays of dtype float32")
 
     # Copy if not destroy - this way we can have all following operations in-place
     # for simplicity
     if not destroy:
-        rpkm = rpkm.copy()
+        abundance = abundance.copy()
         tnf = tnf.copy()
 
     # Normalize samples to have same depth
-    sample_depths_sum = rpkm.sum(axis=0)
+    sample_depths_sum = abundance.sum(axis=0)
     if _np.any(sample_depths_sum == 0):
         raise ValueError(
             "One or more samples have zero depth in all sequences, so cannot be depth normalized"
         )
-    rpkm *= 1_000_000 / sample_depths_sum
-    total_abundance = rpkm.sum(axis=1)
+    abundance *= 1_000_000 / sample_depths_sum
+    total_abundance = abundance.sum(axis=1)
 
-    # Normalize rpkm to sum to 1
-    n_samples = rpkm.shape[1]
+    # Normalize abundance to sum to 1
+    n_samples = abundance.shape[1]
     zero_total_abundance = total_abundance == 0
-    rpkm[zero_total_abundance] = 1 / n_samples
+    abundance[zero_total_abundance] = 1 / n_samples
     nonzero_total_abundance = total_abundance.copy()
     nonzero_total_abundance[zero_total_abundance] = 1.0
-    rpkm /= nonzero_total_abundance.reshape((-1, 1))
+    abundance /= nonzero_total_abundance.reshape((-1, 1))
 
     # Normalize TNF and total abundance to make SSE loss work better
     total_abundance = _np.log(total_abundance.clip(min=0.001))
@@ -123,7 +123,7 @@ def make_dataloader(
     weights.shape = (len(weights), 1)
 
     ### Create final tensors and dataloader ###
-    depthstensor = _torch.from_numpy(rpkm)  # this is a no-copy operation
+    depthstensor = _torch.from_numpy(abundance)  # this is a no-copy operation
     tnftensor = _torch.from_numpy(tnf)
     total_abundance_tensor = _torch.from_numpy(total_abundance)
     weightstensor = _torch.from_numpy(weights)

diff --git a/workflow_avamb/avamb.snake.conda.smk b/workflow_avamb/avamb.snake.conda.smk
@@ -357,7 +357,7 @@ rule run_avamb:
         rm -f {OUTDIR}/contigs.flt.mmi
         rm -rf {output.outdir_avamb} 
         {AVAMB_PRELOAD}
-        vamb --outdir {output.outdir_avamb} --fasta {input.contigs} -p {threads} --rpkm {input.abundance} -m {MIN_CONTIG_SIZE} --minfasta {MIN_BIN_SIZE}  {params.cuda}  {AVAMB_PARAMS}
+        vamb --outdir {output.outdir_avamb} --fasta {input.contigs} -p {threads} --abundance {input.abundance} -m {MIN_CONTIG_SIZE} --minfasta {MIN_BIN_SIZE}  {params.cuda}  {AVAMB_PARAMS}
         mkdir -p {OUTDIR}/Final_bins
         mkdir -p {OUTDIR}/tmp/checkm2_all
         mkdir -p {OUTDIR}/tmp/ripped_bins