Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not make files lengths.npz and contignames #307

Merged
merged 1 commit into from
Jul 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions vamb/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,14 +919,6 @@ def run(
comp_metadata = composition.metadata
del composition, abundance

# Write contignames and contiglengths needed for dereplication purposes
np.savetxt(
vamb_options.out_dir.joinpath("contignames"),
comp_metadata.identifiers,
fmt="%s",
)
np.savez(vamb_options.out_dir.joinpath("lengths.npz"), comp_metadata.lengths)

if vae_options is not None:
assert latent is not None
assert comp_metadata.nseqs == len(latent)
Expand Down
9 changes: 1 addition & 8 deletions vamb/parsecontigs.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,3 @@
__doc__ = """Calculate tetranucleotide frequency from a FASTA file.

Usage:
>>> with open('/path/to/contigs.fna') as filehandle
... tnfs, contignames, lengths = read_contigs(filehandle)
"""

import os as _os
import numpy as _np
import vamb.vambtools as _vambtools
Expand Down Expand Up @@ -121,7 +114,7 @@ def save(self, io: Union[str, Path, IO[bytes]]):
)

@classmethod
def load(cls, io: Union[str, IO[bytes]]):
def load(cls, io: Union[str, IO[bytes], Path]):
arrs = _np.load(io, allow_pickle=True)
metadata = CompositionMetaData(
_vambtools.validate_input_array(arrs["identifiers"]),
Expand Down
2 changes: 0 additions & 2 deletions workflow_avamb/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,6 @@ Avamb produces the following output files:
- `aae_z_clusters.tsv`: file generated by clustering the AAE z latent space, where each row is a sequence: Left column for the cluster (i.e. bin) name, right column for the sequence name. You can create the FASTA-file bins themselves using the script in `src/create_fasta.py`
- `aae_z_latent.npz`: this contains the output of the AAE model z latent space.
- `composition.npz`: a Numpy .npz file that contains all kmer composition information computed by Avamb from the FASTA file. This can be provided to another run of Avamb to skip the composition calculation step.
- `contignames`: text file containing a list of the contigs remaining after the minimum contig size allowed, and defined on the `min_contig_size` in the `config.json` file.
- `lengths.npz`: Numpy object that contains the contig lengths, in the same order as the contignames.
- `log.txt`: a text file with information about the Avamb run. Look here (and at stderr) if you experience errors.
- `model.pt`: a file containing the trained VAE model. When running Avamb from a Python interpreter, the VAE can be loaded from this file to skip training.
- `aae_model.pt`: a file containing the trained AAE model. When running Avamb from a Python interpreter, the AAE can be loaded from this file to skip training.
Expand Down
13 changes: 5 additions & 8 deletions workflow_avamb/avamb.snake.conda.smk
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,6 @@ rule run_avamb:
clusters_aae_z=os.path.join(OUTDIR,"avamb/aae_z_clusters_split.tsv"),
clusters_aae_y=os.path.join(OUTDIR,"avamb/aae_y_clusters_split.tsv"),
clusters_vamb=os.path.join(OUTDIR,"avamb/vae_clusters_split.tsv"),
contignames=os.path.join(OUTDIR,"avamb/contignames"),
contiglenghts=os.path.join(OUTDIR,"avamb/lengths.npz")
params:
walltime="86400",
nodes="1",
Expand Down Expand Up @@ -471,8 +469,7 @@ rule create_cluster_scores_bin_path_dictionaries:
rule run_drep_manual_vamb_z_y:
input:
cluster_score_dict_path_avamb=os.path.join(OUTDIR,"tmp/cs_d_avamb.json"),
contignames=os.path.join(OUTDIR,"avamb/contignames"),
contiglengths=os.path.join(OUTDIR,"avamb/lengths.npz"),
composition=os.path.join(OUTDIR,"avamb/composition.npz"),
clusters_aae_z=os.path.join(OUTDIR,"avamb/aae_z_clusters_split.tsv"),
clusters_aae_y=os.path.join(OUTDIR,"avamb/aae_y_clusters_split.tsv"),
clusters_vamb=os.path.join(OUTDIR,"avamb/vae_clusters_split.tsv")
Expand All @@ -496,8 +493,8 @@ rule run_drep_manual_vamb_z_y:

shell:
"""
python {params.path} --cs_d {input.cluster_score_dict_path_avamb} --names {input.contignames}\
--lengths {input.contiglengths} --output {output.clusters_avamb_manual_drep}\
python {params.path} --cs_d {input.cluster_score_dict_path_avamb} --composition {input.composition}\
--output {output.clusters_avamb_manual_drep}\
--clusters {input.clusters_aae_z} {input.clusters_aae_y} {input.clusters_vamb}\
--comp {MIN_COMP} --cont {MAX_CONT} --min_bin_size {MIN_BIN_SIZE}
"""
Expand Down Expand Up @@ -533,8 +530,8 @@ checkpoint create_ripped_bins_avamb:
shell:
"""
python {params.path} -r {OUTDIR}/avamb/ --ci {input.path_avamb_manually_drep_clusters}\
--co {output.path_avamb_manually_drep_clusters_ripped} -l {OUTDIR}/avamb/lengths.npz\
-n {OUTDIR}/avamb/contignames --bp_d {input.bin_path_dict_path} --br {OUTDIR}/tmp/ripped_bins\
--co {output.path_avamb_manually_drep_clusters_ripped} -c {OUTDIR}/avamb/composition.npz\
--bp_d {input.bin_path_dict_path} --br {OUTDIR}/tmp/ripped_bins\
--bin_separator C --log_nc_ripped_bins {output.name_bins_ripped_file}
"""
# If after run_drep_manual_vamb_z_y rule (dereplication rule), there are no contigs present in more than one bin,
Expand Down
23 changes: 11 additions & 12 deletions workflow_avamb/src/manual_drep_JN.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,8 @@
def main(
# Path to output clusters file. Will error if it already exists
outpath: Path,
# Path to names file of contig names. CHANGED
names: Path,
# Path to length.npz of contig length. Output by Vamb
lengths_npz: Path,
# Path to composition.npz
composition_path: Path,
# Path to CheckM2 quality_report.tsv file
quality_report: dict[str, list],
# List of paths to clusters.tsv files as output by Vamb.
Expand All @@ -36,15 +34,16 @@ def main(
bins_extension: str,
min_bin_size: int,
) -> None:
# Load contig names
contig_names: list[str] = list(np.loadtxt(names, dtype=object))
# Load contig names and lengths
comp = vamb.parsecontigs.Composition.load(composition_path)

contig_names: list[str] = list(comp.metadata.identifiers)
assert isinstance(contig_names, list)
assert isinstance(contig_names[0], str)

# Load lengths
lengths: np.ndarray = vamb.vambtools.read_npz(lengths_npz)
lengths = comp.metadata.lengths
assert len(lengths) == len(contig_names)
del comp # free up memory

# Load CheckM2
(bin_names, qualities, bin_by_name) = load_checkm2(
Expand Down Expand Up @@ -276,8 +275,9 @@ def compute_to_remove(
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--cs_d", type=str, help="path bins_scores dictionary")
parser.add_argument("--names", type=str, help="Contig names txt file path ")
parser.add_argument("--lengths", type=str, help="Contig lengths npz file path ")
parser.add_argument(
"--composition", type=Path, help="Path to the composition.npz file"
)
parser.add_argument(
"--output",
type=str,
Expand Down Expand Up @@ -308,8 +308,7 @@ def compute_to_remove(

main(
outpath=opt.output,
names=opt.names,
lengths_npz=opt.lengths,
composition_path=opt.composition,
quality_report=cluster_scores,
binnings=opt.clusters,
min_cov=opt.cov,
Expand Down
13 changes: 5 additions & 8 deletions workflow_avamb/src/rip_bins.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from Bio import SeqIO
import os
import numpy as np
from collections import OrderedDict
import argparse
import networkx as nx
Expand Down Expand Up @@ -432,8 +431,7 @@ def find_remaining_clusters_ripped_and_write_ripped_bins(
parser.add_argument(
"--co", type=str, help="path dereplicated clusters without ripped clusters"
)
parser.add_argument("-l", type=str, help="path to contig lengths")
parser.add_argument("-n", type=str, help="path to contig names")
parser.add_argument("-c", type=str, help="path to composition")
parser.add_argument("--bp_d", type=str, help="bin_path dictionary path")
parser.add_argument("--br", type=str, help="path ripped bins")
parser.add_argument("--bin_separator", type=str, help="path ripped bins")
Expand All @@ -459,11 +457,10 @@ def find_remaining_clusters_ripped_and_write_ripped_bins(
with open(clusters_path) as file:
cluster_contigs = vamb.vambtools.read_clusters(file)

contig_lengths_file = opt.l
contig_lengths = cast(Sequence[int], np.load(contig_lengths_file)["arr_0"])

contignames_file = opt.n
contig_names = cast(Sequence[str], np.loadtxt(contignames_file, dtype=object))
comp = vamb.parsecontigs.Composition.load(opt.c)
contig_lengths = cast(Sequence[int], comp.metadata.lengths)
contig_names = cast(Sequence[str], comp.metadata.identifiers)
del comp # free memory

path_ripped = cast(str, opt.br)

Expand Down
Loading