Skip to content

Commit

Permalink
Fix: Update src/create_fasta.py
Browse files Browse the repository at this point in the history
In 15638ff, binsplitting was refactored. This change removed the minsize keyword
from vambtools.write_bins, which src/create_fasta.py relies on, breaking the
script.
Fix this by making the following changes:
* Read in the FASTA in two passes: one to figure out which sequences to store in
  memory, and one which stores them
* Filter the bins in the script before calling write_bins
* Change the default max bins from the overly conservative 250 to 1000.
  • Loading branch information
jakobnissen committed Mar 8, 2024
1 parent 491590c commit 8a9d061
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 6 deletions.
20 changes: 16 additions & 4 deletions src/create_fasta.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sys
import argparse
import vamb
import pathlib

parser = argparse.ArgumentParser(
description="""Command-line bin creator.
Expand All @@ -11,7 +12,7 @@

parser.add_argument("fastapath", help="Path to FASTA file")
parser.add_argument("clusterspath", help="Path to clusters.tsv")
parser.add_argument("minsize", help="Minimum size of bin", type=int, default=0)
parser.add_argument("minsize", help="Minimum size of bin in bp", type=int, default=0)
parser.add_argument("outdir", help="Directory to create")

if len(sys.argv) == 1:
Expand All @@ -20,10 +21,21 @@

args = parser.parse_args()

# Read in the FASTA file only to get the length of each sequence. This way, we
# can avoid storing contigs in memory for bins that will never get output anyway
lens: dict[str, int] = dict()
with vamb.vambtools.Reader(args.fastapath) as file:
for record in vamb.vambtools.byte_iterfasta(file):
lens[record.identifier] = len(record)

with open(args.clusterspath) as file:
clusters = vamb.vambtools.read_clusters(file)

clusters = {
cluster: contigs
for (cluster, contigs) in clusters.items()
if sum(lens[c] for c in contigs) >= args.minsize
}

with vamb.vambtools.Reader(args.fastapath) as file:
vamb.vambtools.write_bins(
args.outdir, clusters, file, maxbins=None, minsize=args.minsize
)
vamb.vambtools.write_bins(pathlib.Path(args.outdir), clusters, file, maxbins=None)
4 changes: 2 additions & 2 deletions vamb/vambtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,15 +617,15 @@ def write_bins(
directory: Path,
bins: dict[str, set[str]],
fastaio: Iterable[bytes],
maxbins: Optional[int] = 250,
maxbins: Optional[int] = 1000,
):
"""Writes bins as FASTA files in a directory, one file per bin.
Inputs:
directory: Directory to create or put files in
bins: dict[str: set[str]] (can be loaded from clusters.tsv using vamb.cluster.read_clusters)
fastaio: bytes iterator containing FASTA file with all sequences
maxbins: None or else raise an error if trying to make more bins than this [250]
maxbins: None or else raise an error if trying to make more bins than this [1000]
Output: None
"""

Expand Down

0 comments on commit 8a9d061

Please sign in to comment.