Skip to content

Commit

Permalink
More consistently rename RPKM to abundance
Browse files Browse the repository at this point in the history
* RPKM is more obscure, technical terminology
* It's technically wrong, since we don't require the abundance to be expressed
  in terms of reads per kilobase per million mapped reads.
* We call the output file `abundance.npz`
* "Abundance" is the standard terminology for this feature in the literature
  • Loading branch information
jakobnissen committed Jul 18, 2024
1 parent c049fbd commit 7bdc15c
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 57 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/cli_vamb.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,30 +37,30 @@ jobs:
pip install -e .
- name: Run VAMB
run: |
vamb bin default --outdir outdir_vamb --fasta catalogue_mock.fna.gz --rpkm abundance_mock.npz -l 32 -e 10 -q 2 -o C --minfasta 200000 -t 10
vamb bin default --outdir outdir_vamb --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz -l 32 -e 10 -q 2 -o C --minfasta 200000 -t 10
ls -la outdir_vamb
cat outdir_vamb/log.txt
- name: Run TaxVAMB
run: |
vamb bin taxvamb --outdir outdir_taxvamb --fasta catalogue_mock.fna.gz --rpkm abundance_mock.npz --taxonomy taxonomy_mock.tsv -pe 10 -pt 10 -e 10 -q -t 10 -o C --minfasta 200000
vamb bin taxvamb --outdir outdir_taxvamb --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --taxonomy taxonomy_mock.tsv -pe 10 -pt 10 -e 10 -q -t 10 -o C --minfasta 200000
ls -la outdir_taxvamb
cat outdir_taxvamb/log.txt
vamb bin taxvamb --outdir outdir_taxvamb_no_predict --no_predictor --fasta catalogue_mock.fna.gz --rpkm abundance_mock.npz --taxonomy taxonomy_mock.tsv -e 10 -q -t 10 -o C --minfasta 200000
vamb bin taxvamb --outdir outdir_taxvamb_no_predict --no_predictor --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --taxonomy taxonomy_mock.tsv -e 10 -q -t 10 -o C --minfasta 200000
ls -la outdir_taxvamb_no_predict
cat outdir_taxvamb_no_predict/log.txt
vamb bin taxvamb --outdir outdir_taxvamb_preds --fasta catalogue_mock.fna.gz --rpkm abundance_mock.npz --no_predictor --taxonomy outdir_taxvamb/results_taxometer.tsv -e 10 -q -t 10 -o C --minfasta 200000
vamb bin taxvamb --outdir outdir_taxvamb_preds --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --no_predictor --taxonomy outdir_taxvamb/results_taxometer.tsv -e 10 -q -t 10 -o C --minfasta 200000
ls -la outdir_taxvamb_preds
cat outdir_taxvamb_preds/log.txt
- name: Run Taxometer
run: |
vamb taxometer --outdir outdir_taxometer --fasta catalogue_mock.fna.gz --rpkm abundance_mock.npz --taxonomy taxonomy_mock.tsv -pe 10 -pt 10
vamb taxometer --outdir outdir_taxometer --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --taxonomy taxonomy_mock.tsv -pe 10 -pt 10
ls -la outdir_taxometer
cat outdir_taxometer/log.txt
vamb taxometer --outdir outdir_taxometer_pred --fasta catalogue_mock.fna.gz --rpkm abundance_mock.npz --taxonomy outdir_taxometer/results_taxometer.tsv -pe 10 -pt 10
vamb taxometer --outdir outdir_taxometer_pred --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --taxonomy outdir_taxometer/results_taxometer.tsv -pe 10 -pt 10
ls -la outdir_taxometer_pred
cat outdir_taxometer/log.txt
- name: Run k-means reclustering
run: |
vamb recluster --outdir outdir_recluster --fasta catalogue_mock.fna.gz --rpkm abundance_mock.npz --latent_path outdir_taxvamb/vaevae_latent.npy --clusters_path outdir_taxvamb/vaevae_clusters_split.tsv --hmmout_path markers_mock.hmmout --algorithm kmeans --minfasta 200000
vamb recluster --outdir outdir_recluster --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --latent_path outdir_taxvamb/vaevae_latent.npy --clusters_path outdir_taxvamb/vaevae_clusters_split.tsv --hmmout_path markers_mock.hmmout --algorithm kmeans --minfasta 200000
ls -la outdir_recluster
cat outdir_recluster/log.txt
2 changes: 1 addition & 1 deletion doc/how_to_run.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ or
When parsed, Vamb will produce the file `abundance.npz`, which can be used for future
Vamb runs instead:
```
--rpkm abundance.npz
--abundance abundance.npz
```

__Note:__ Vamb will check that the sequence names in the TSV / BAM files correspond
Expand Down
57 changes: 28 additions & 29 deletions vamb/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def __init__(
assert isinstance(min_alignment_id, (float, type(None)))
assert isinstance(refcheck, bool)

# Make sure only one RPKM input is there
# Make sure only one abundance input is there
if (
(bampaths is not None)
+ (abundancepath is not None)
Expand Down Expand Up @@ -189,7 +189,7 @@ def __init__(
if min_alignment_id is not None:
if bampaths is None:
raise argparse.ArgumentTypeError(
"If minid is set, RPKM must be passed as bam files"
"If minid is set, abundances must be passed as bam files"
)
if (
not isfinite(min_alignment_id)
Expand Down Expand Up @@ -609,7 +609,7 @@ def calc_tnf(
return composition


def calc_rpkm(
def calc_abundance(
abundance_options: AbundanceOptions,
outdir: Path,
comp_metadata: vamb.parsecontigs.CompositionMetaData,
Expand Down Expand Up @@ -670,7 +670,7 @@ def calc_rpkm(
assert False

elapsed = round(time.time() - begintime, 2)
logger.info(f"\tProcessed RPKM in {elapsed} seconds.\n")
logger.info(f"\tProcessed abundance in {elapsed} seconds.\n")

return abundance

Expand Down Expand Up @@ -903,7 +903,7 @@ def load_composition_and_abundance(
binsplitter: vamb.vambtools.BinSplitter,
) -> Tuple[vamb.parsecontigs.Composition, vamb.parsebam.Abundance]:
composition = calc_tnf(comp_options, vamb_options.out_dir, binsplitter)
abundance = calc_rpkm(
abundance = calc_abundance(
abundance_options,
vamb_options.out_dir,
composition.metadata,
Expand Down Expand Up @@ -1015,7 +1015,7 @@ def run(


def predict_taxonomy(
rpkms: np.ndarray,
abundance: np.ndarray,
tnfs: np.ndarray,
lengths: np.ndarray,
contignames: np.ndarray,
Expand All @@ -1042,7 +1042,7 @@ def predict_taxonomy(
targets = np.array([ind_nodes[i] for i in classes_order])

model = vamb.taxvamb_encode.VAMB2Label(
rpkms.shape[1],
abundance.shape[1],
len(nodes),
nodes,
table_parent,
Expand All @@ -1052,13 +1052,13 @@ def predict_taxonomy(
)

dataloader_vamb = vamb.encode.make_dataloader(
rpkms,
abundance,
tnfs,
lengths,
batchsize=predictor_training_options.batchsize,
)
dataloader_joint = vamb.taxvamb_encode.make_dataloader_concat_hloss(
rpkms,
abundance,
tnfs,
lengths,
targets,
Expand Down Expand Up @@ -1137,14 +1137,14 @@ def run_taxonomy_predictor(
abundance_options=abundance_options,
binsplitter=vamb.vambtools.BinSplitter.inert_splitter(),
)
rpkms, tnfs, lengths, contignames = (
abundance, tnfs, lengths, contignames = (
abundance.matrix,
composition.matrix,
composition.metadata.lengths,
composition.metadata.identifiers,
)
predict_taxonomy(
rpkms=rpkms,
abundance=abundance,
tnfs=tnfs,
lengths=lengths,
contignames=contignames,
Expand Down Expand Up @@ -1172,7 +1172,7 @@ def run_vaevae(
abundance_options=abundance_options,
binsplitter=vamb.vambtools.BinSplitter.inert_splitter(),
)
rpkms, tnfs, lengths, contignames = (
abundance, tnfs, lengths, contignames = (
abundance.matrix,
composition.matrix,
composition.metadata.lengths,
Expand All @@ -1182,7 +1182,7 @@ def run_vaevae(
if taxonomy_options.taxonomy_path is not None and not taxonomy_options.no_predictor:
logger.info("Predicting missing values from taxonomy")
predict_taxonomy(
rpkms=rpkms,
abundance=abundance,
tnfs=tnfs,
lengths=lengths,
contignames=contignames,
Expand Down Expand Up @@ -1224,7 +1224,7 @@ def run_vaevae(

assert vae_options is not None
vae = vamb.taxvamb_encode.VAEVAEHLoss(
rpkms.shape[1],
abundance.shape[1],
len(nodes),
nodes,
table_parent,
Expand All @@ -1237,14 +1237,14 @@ def run_vaevae(
)

dataloader_vamb = vamb.encode.make_dataloader(
rpkms,
abundance,
tnfs,
lengths,
batchsize=vae_training_options.batchsize,
cuda=vamb_options.cuda,
)
dataloader_joint = vamb.taxvamb_encode.make_dataloader_concat_hloss(
rpkms,
abundance,
tnfs,
lengths,
targets,
Expand All @@ -1254,7 +1254,7 @@ def run_vaevae(
cuda=vamb_options.cuda,
)
dataloader_labels = vamb.taxvamb_encode.make_dataloader_labels_hloss(
rpkms,
abundance,
tnfs,
lengths,
targets,
Expand All @@ -1264,7 +1264,7 @@ def run_vaevae(
cuda=vamb_options.cuda,
)

shapes = (rpkms.shape[1], 103, 1, len(nodes))
shapes = (abundance.shape[1], 103, 1, len(nodes))
dataloader = vamb.taxvamb_encode.make_dataloader_semisupervised_hloss(
dataloader_joint,
dataloader_vamb,
Expand Down Expand Up @@ -1595,7 +1595,7 @@ def run_inner(self):


def add_input_output_arguments(subparser):
"""Add TNFs and RPKMs arguments to a subparser."""
"""Add TNFs and abundance arguments to a subparser."""
# TNF arguments
tnfos = subparser.add_argument_group(
title="TNF input (either fasta or all .npz files required)"
Expand All @@ -1605,38 +1605,38 @@ def add_input_output_arguments(subparser):
"--composition", metavar="", type=Path, help="path to .npz of composition"
)

# RPKM arguments
rpkmos = subparser.add_argument_group(
title="RPKM input (exactly one input required)"
# Abundance arguments
abundanceos = subparser.add_argument_group(
title="Abundance input (exactly one input required)"
)
# Note: This argument is deprecated, but we'll keep supporting it for now.
# Instead, use --bamdir.
rpkmos.add_argument(
abundanceos.add_argument(
"--bamfiles",
dest="bampaths",
metavar="",
type=Path,
help=argparse.SUPPRESS,
nargs="+",
)
rpkmos.add_argument(
abundanceos.add_argument(
"--bamdir",
metavar="",
type=Path,
help="Dir with .bam files to use",
)
rpkmos.add_argument(
abundanceos.add_argument(
"--aemb",
metavar="",
type=Path,
help="Dir with .tsv files from strobealign --aemb",
)
rpkmos.add_argument(
"--rpkm",
abundanceos.add_argument(
"--abundance",
metavar="",
dest="abundancepath",
type=Path,
help="path to .npz of RPKM abundances",
help="path to .npz of abundances",
)

reqos = subparser.add_argument_group(title="Output (required)", description=None)
Expand Down Expand Up @@ -2048,7 +2048,6 @@ def main():
prog="vamb",
description=doc,
formatter_class=argparse.RawDescriptionHelpFormatter,
# usage="%(prog)s outdir tnf_input rpkm_input [options]",
add_help=False,
)

Expand Down
38 changes: 19 additions & 19 deletions vamb/encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,64 +50,64 @@ def set_batchsize(


def make_dataloader(
rpkm: _np.ndarray,
abundance: _np.ndarray,
tnf: _np.ndarray,
lengths: _np.ndarray,
batchsize: int = 256,
destroy: bool = False,
cuda: bool = False,
) -> _DataLoader:
"""Create a DataLoader from RPKM, TNF and lengths.
"""Create a DataLoader from abundance, TNF and lengths.
The dataloader is an object feeding minibatches of contigs to the VAE.
The data are normalized versions of the input datasets.
Inputs:
rpkm: RPKM matrix (N_contigs x N_samples)
abundance: Abundance matrix (N_contigs x N_samples)
tnf: TNF matrix (N_contigs x N_TNF)
lengths: Numpy array of sequence length (N_contigs)
batchsize: Starting size of minibatches for dataloader
destroy: Mutate rpkm and tnf array in-place instead of making a copy.
destroy: Mutate abundance and tnf array in-place instead of making a copy.
cuda: Pagelock memory of dataloader (use when using GPU acceleration)
Outputs:
DataLoader: An object feeding data to the VAE
"""

if not isinstance(rpkm, _np.ndarray) or not isinstance(tnf, _np.ndarray):
raise ValueError("TNF and RPKM must be Numpy arrays")
if not isinstance(abundance, _np.ndarray) or not isinstance(tnf, _np.ndarray):
raise ValueError("TNF and abundance must be Numpy arrays")

if batchsize < 1:
raise ValueError(f"Batch size must be minimum 1, not {batchsize}")

if len(rpkm) != len(tnf) or len(tnf) != len(lengths):
raise ValueError("Lengths of RPKM, TNF and lengths arrays must be the same")
if len(abundance) != len(tnf) or len(tnf) != len(lengths):
raise ValueError("Lengths of abundance, TNF and lengths arrays must be the same")

if not (rpkm.dtype == tnf.dtype == _np.float32):
raise ValueError("TNF and RPKM must be Numpy arrays of dtype float32")
if not (abundance.dtype == tnf.dtype == _np.float32):
raise ValueError("TNF and abundance must be Numpy arrays of dtype float32")

# Copy if not destroy - this way we can have all following operations in-place
# for simplicity
if not destroy:
rpkm = rpkm.copy()
abundance = abundance.copy()
tnf = tnf.copy()

# Normalize samples to have same depth
sample_depths_sum = rpkm.sum(axis=0)
sample_depths_sum = abundance.sum(axis=0)
if _np.any(sample_depths_sum == 0):
raise ValueError(
"One or more samples have zero depth in all sequences, so cannot be depth normalized"
)
rpkm *= 1_000_000 / sample_depths_sum
total_abundance = rpkm.sum(axis=1)
abundance *= 1_000_000 / sample_depths_sum
total_abundance = abundance.sum(axis=1)

# Normalize rpkm to sum to 1
n_samples = rpkm.shape[1]
# Normalize abundance to sum to 1
n_samples = abundance.shape[1]
zero_total_abundance = total_abundance == 0
rpkm[zero_total_abundance] = 1 / n_samples
abundance[zero_total_abundance] = 1 / n_samples
nonzero_total_abundance = total_abundance.copy()
nonzero_total_abundance[zero_total_abundance] = 1.0
rpkm /= nonzero_total_abundance.reshape((-1, 1))
abundance /= nonzero_total_abundance.reshape((-1, 1))

# Normalize TNF and total abundance to make SSE loss work better
total_abundance = _np.log(total_abundance.clip(min=0.001))
Expand All @@ -123,7 +123,7 @@ def make_dataloader(
weights.shape = (len(weights), 1)

### Create final tensors and dataloader ###
depthstensor = _torch.from_numpy(rpkm) # this is a no-copy operation
depthstensor = _torch.from_numpy(abundance) # this is a no-copy operation
tnftensor = _torch.from_numpy(tnf)
total_abundance_tensor = _torch.from_numpy(total_abundance)
weightstensor = _torch.from_numpy(weights)
Expand Down
2 changes: 1 addition & 1 deletion workflow_avamb/avamb.snake.conda.smk
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ rule run_avamb:
rm -f {OUTDIR}/contigs.flt.mmi
rm -rf {output.outdir_avamb}
{AVAMB_PRELOAD}
vamb --outdir {output.outdir_avamb} --fasta {input.contigs} -p {threads} --rpkm {input.abundance} -m {MIN_CONTIG_SIZE} --minfasta {MIN_BIN_SIZE} {params.cuda} {AVAMB_PARAMS}
vamb --outdir {output.outdir_avamb} --fasta {input.contigs} -p {threads} --abundance {input.abundance} -m {MIN_CONTIG_SIZE} --minfasta {MIN_BIN_SIZE} {params.cuda} {AVAMB_PARAMS}
mkdir -p {OUTDIR}/Final_bins
mkdir -p {OUTDIR}/tmp/checkm2_all
mkdir -p {OUTDIR}/tmp/ripped_bins
Expand Down

0 comments on commit 7bdc15c

Please sign in to comment.