Commit c5b9a57

update to v1.1.0

raufs committed May 17, 2024
1 parent 3260e70 commit c5b9a57

Showing 4 changed files with 70 additions and 56 deletions.
39 changes: 17 additions & 22 deletions README.md
@@ -126,25 +126,24 @@ skder -h
The help function should return the following:

```
usage: skder [-h] [-g GENOMES [GENOMES ...]] [-t TAXA_NAME] -o OUTPUT_DIRECTORY [-d DEREPLICATION_MODE]
[-i PERCENT_IDENTITY_CUTOFF] [-f ALIGNED_FRACTION_CUTOFF] [-a MAX_AF_DISTANCE_CUTOFF] [-p SKANI_TRIANGLE_PARAMETERS]
[-c CPUS] [-s] [-n] [-l] [-x] [-u] [-v]
usage: skder [-h] [-g GENOMES [GENOMES ...]] [-t TAXA_NAME] [-r GTDB_RELEASE] -o OUTPUT_DIRECTORY [-d DEREPLICATION_MODE] [-i PERCENT_IDENTITY_CUTOFF] [-f ALIGNED_FRACTION_CUTOFF]
[-a MAX_AF_DISTANCE_CUTOFF] [-p SKANI_TRIANGLE_PARAMETERS] [-c CPUS] [-s] [-n] [-l] [-u] [-v]
Program: skder
Author: Rauf Salamzade
Affiliation: Kalan Lab, UW Madison, Department of Medical Microbiology and Immunology
skDER: efficient & high-resolution dereplication of microbial genomes to select
representative genomes.
skDER will perform dereplication of genomes using skani average nucleotide identity
(ANI) and aligned fraction (AF) estimates and either a dynamic programming or
greedy-based approach. It assesses such pairwise ANI & AF estimates to determine
whether two genomes are similar to each other and then chooses which genome is better
suited to serve as a representative based on assembly N50 (favoring the more contiguous
assembly) and connectedness (favoring genomes deemed similar to a greater number of
alternate genomes).
options:
-h, --help show this help message and exit
@@ -155,6 +154,8 @@ options:
-t TAXA_NAME, --taxa_name TAXA_NAME
Genus or species identifier from GTDB (currently R214)
for which to download genomes for [Optional].
-r GTDB_RELEASE, --gtdb_release GTDB_RELEASE
Which GTDB release to use if -t argument issued [Default is R220].
-o OUTPUT_DIRECTORY, --output_directory OUTPUT_DIRECTORY
Output directory.
-d DEREPLICATION_MODE, --dereplication_mode DEREPLICATION_MODE
@@ -183,12 +184,6 @@ options:
genomes to their closest representative genomes.
-l, --symlink Symlink representative genomes in results subdirectory
instead of performing a copy of the files.
-x, --avoid_symlink By default, skDER symlinks input genomes provided by users
into the output directory to avoid creating N50 related index files
in the directory of the original input genomes. If your system doesn't
support symlink - you can use this flag, but it will create index
files for N50 calculations in the same directory as
your genomes.
-u, --ncbi_nlm_url Try using the NCBI ftp address with '.nlm' for
ncbi-genome-download if there are issues.
-v, --version Report version of skDER.
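For reference, a hypothetical invocation that exercises the new `-r` flag alongside taxon-based genome download might look like the following (the taxon name, output directory, and CPU count are purely illustrative):

```
skder -t "Cutibacterium avidum" -r R220 -o skDER_Results/ -c 10
```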
62 changes: 34 additions & 28 deletions bin/skder
@@ -47,11 +47,13 @@ import argparse
import gzip
from collections import defaultdict
import pkg_resources
import math
import multiprocessing

version = pkg_resources.require("skDER")[0].version

ACCEPTED_SUFFICES = set(['fasta', 'fas', 'fna', 'fa'])
VALID_GTDB_RELEASES = set(['R214', 'R220'])

def create_parser():
""" Parse arguments """
@@ -73,7 +75,8 @@ def create_parser():
""", formatter_class=argparse.RawTextHelpFormatter)

parser.add_argument('-g', '--genomes', nargs='+', help='Genome assembly files in (gzipped) FASTA format\n(accepted suffices are: *.fasta,\n*.fa, *.fas, or *.fna) [Optional].', required=False, default=[])
parser.add_argument('-t', '--taxa_name', help='Genus or species identifier from GTDB (currently R214)\nfor which to download genomes for [Optional].', required=False, default=None)
parser.add_argument('-t', '--taxa_name', help='Genus or species identifier from GTDB for which to\ndownload genomes for and include in\ndereplication analysis [Optional].', required=False, default=None)
parser.add_argument('-r', '--gtdb_release', help='Which GTDB release to use if -t argument issued [Default is R220].', default="R220")
parser.add_argument('-o', '--output_directory', help='Output directory.', required=True)
parser.add_argument('-d', '--dereplication_mode', help='Whether to use a "dynamic" (more concise) or "greedy" (more\ncomprehensive) approach to selecting representative genomes.\n[Default is "dynamic"]', required=False, default="dynamic")
parser.add_argument('-i', '--percent_identity_cutoff', type=float, help="ANI cutoff for dereplication [Default is 99.0].", required=False, default=99.0)
@@ -84,7 +87,6 @@ def create_parser():
parser.add_argument('-s', '--sanity_check', action='store_true', help="Confirm each FASTA file provided or downloaded is actually\na FASTA file. Makes it slower, but generally\ngood practice.", required=False, default=False)
parser.add_argument('-n', '--determine_clusters', action='store_true', help="Perform secondary clustering to assign non-representative\ngenomes to their closest representative genomes.", required=False, default=False)
parser.add_argument('-l', '--symlink', action='store_true', help="Symlink representative genomes in results subdirectory\ninstead of performing a copy of the files.", required=False, default=False)
parser.add_argument('-x', '--avoid_symlink', action='store_true', help="By default, skDER symlinks input genomes provided by users\ninto the output directory to avoid creating N50 related index files\nin the directory of the original input genomes. If your system doesn't\nsupport symlink - you can use this flag, but it will create index\nfiles for N50 calculations in the same directory as\nyour genomes.", required=False, default=False)
parser.add_argument('-u', '--ncbi_nlm_url', action='store_true', help="Try using the NCBI ftp address with '.nlm' for\nncbi-genome-download if there are issues.", required=False, default=False)
parser.add_argument('-v', '--version', action='store_true', help="Report version of skDER.", required=False, default=False)
args = parser.parse_args()
@@ -102,6 +104,7 @@ def skder_main():
taxa_name = None
if myargs.taxa_name:
taxa_name = myargs.taxa_name.strip('"')
gtdb_release = myargs.gtdb_release.upper()
outdir = os.path.abspath(myargs.output_directory) + '/'
selection_mode = myargs.dereplication_mode.lower()
percent_identity_cutoff = myargs.percent_identity_cutoff
@@ -112,7 +115,6 @@ def skder_main():
symlink_flag = myargs.symlink
determine_clusters_flag = myargs.determine_clusters
sanity_check = myargs.sanity_check
avoid_symlink_flag = myargs.avoid_symlink
ncbi_nlm_url_flag = myargs.ncbi_nlm_url

ngd_url = "https://ftp.ncbi.nih.gov/genomes"
@@ -125,6 +127,12 @@ def skder_main():
sys.stderr.write('Selection mode requested not valid, must be either "greedy" or "dynamic".')
sys.exit(1)

try:
assert(gtdb_release in VALID_GTDB_RELEASES)
except:
sys.stderr.write('GTDB release requested is not valid. Valid options include: %s\n' % ' '.join(VALID_GTDB_RELEASES))
sys.exit(1)

if os.path.isdir(outdir):
sys.stderr.write("Output directory already exists! Overwriting in 5 seconds, but only where needed will use checkpoint files to skip certain steps...\n")
sleep(5)
@@ -160,13 +168,13 @@ def skder_main():
# if requested.
sys.stdout.write("GTDB listing file not available, using wget to download it.\n")
logObject.info("\nGTDB listing file not available, using wget to download it.")
wget_cmd = ['wget', 'https://github.com/Kalan-Lab/lsaBGC/raw/main/db/GTDB_R214_Information.txt.gz', '-P', outdir]
gtdb_listing_file = outdir + "GTDB_R214_Information.txt.gz"
wget_cmd = ['wget', 'https://github.com/raufs/gtdb_gca_to_taxa_mappings/raw/main/GTDB_' + gtdb_release + '_Information.txt.gz', '-P', outdir]
gtdb_listing_file = outdir + "GTDB_" + gtdb_release + "_Information.txt.gz"
util.runCmd(wget_cmd, logObject, check_files=[gtdb_listing_file])

genbank_accession_listing_file = outdir + 'NCBI_Genbank_Accession_Listing.txt'
sys.stdout.write("--------------------\nStep 0\n--------------------\nBeginning by assessing which genomic assemblies are available for the taxa %s in GTDB and NCBI's GenBank db\n" % taxa_name)
logObject.info("\n--------------------\nStep 0\n--------------------\nBeginning by assessing which genomic assemblies are available for the taxa %s in GTDB and NCBI's GenBank db" % taxa_name)
sys.stdout.write("--------------------\nStep 0\n--------------------\nBeginning by assessing which genomic assemblies are available for the taxa %s in GTDB %s\n" % (taxa_name, gtdb_release))
logObject.info("\n--------------------\nStep 0\n--------------------\nBeginning by assessing which genomic assemblies are available for the taxa %s in GTDB %s" % (taxa_name, gtdb_release))

if not os.path.isfile(genbank_accession_listing_file):
genbank_accession_listing_handle = open(genbank_accession_listing_file, 'w')
@@ -214,16 +222,7 @@ def skder_main():
ngd_real_cmd = ['ncbi-genome-download', '--formats', 'fasta', '--retries', '2', '--section', 'genbank', '-u', ngd_url,
'-A', genbank_accession_listing_file, '-o', genomes_directory, '--flat-output', 'bacteria']
util.runCmd(ngd_real_cmd, logObject, check_directories=[genomes_directory])
"""
# downloaded files are no longer uncompressed as of v1.0.9
uncompress_cmds = []
for f in os.listdir(genomes_directory):
uncompress_cmds.append(['gunzip', genomes_directory + f])#, logObject])
p = multiprocessing.Pool(cpus)
p.map(util.multiProcess, uncompress_cmds)
p.close()
"""

gca_to_species_name = {}
with gzip.open(gtdb_listing_file, 'rt') as ogtdb:
for line in ogtdb:
@@ -260,12 +259,7 @@ def skder_main():
if not suffix in ACCEPTED_SUFFICES: continue
if sanity_check:
assert(util.is_fasta(gf))
if avoid_symlink_flag:
gf_listing_handle.write(gf + '\n')
else:
sl_gf = symlink_genomes_directory + gf.split('/')[-1]
os.symlink(gf, sl_gf)
gf_listing_handle.write(sl_gf + '\n')
gf_listing_handle.write(gf + '\n')
gf_listing_handle.close()

number_of_genomes = None
@@ -280,14 +274,23 @@ def skder_main():
sys.exit(1)

# calculate N50s
n50_dir = outdir + 'Assembly_N50s/'
util.setupDirectories([n50_dir])
n50_inputs = []
genomes = []
with open(all_genomes_listing_file) as oaglf:
for line in oaglf:
genome_path = line.strip()
genome_file = genome_path.split('/')[-1]
n50_inputs.append([genome_path, n50_dir + genome_file + '_n50.txt'])
genomes.append(genome_path)

genome_count = len(genomes)
chunk_size = math.ceil(genome_count/cpus)
genome_chunks = util.divide_chunks(genomes, chunk_size)

n50_dir = outdir + 'Assembly_N50s/'
util.setupDirectories([n50_dir])
n50_inputs = []
for i, gc in enumerate(genome_chunks):
n50_chunk_file = n50_dir + 'chunk_' + str(i) + '.txt'
n50_inputs.append([gc, n50_chunk_file])

p = multiprocessing.Pool(cpus)
p.map(util.compute_n50, n50_inputs)
p.close()
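To make the chunking above concrete, here is a small self-contained sketch of the same arithmetic (genome names and counts are invented for illustration): with 10 genomes and 4 CPUs, chunk_size = ceil(10/4) = 3, so the genome list is split into chunks of 3, 3, 3, and 1 paths, and each chunk is written to one N50 chunk file by a worker process.

```
import math

def divide_chunks(l, n):
    # yield successive n-sized chunks from l (mirrors skDER's util.divide_chunks)
    for i in range(0, len(l), n):
        yield l[i:i + n]

genomes = ['genome_%d.fna' % i for i in range(10)]  # hypothetical genome paths
cpus = 4
chunk_size = math.ceil(len(genomes) / cpus)         # ceil(10 / 4) = 3
genome_chunks = list(divide_chunks(genomes, chunk_size))
print([len(c) for c in genome_chunks])              # [3, 3, 3, 1]
```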
@@ -296,6 +299,9 @@ def skder_main():
concat_n50_result_file = outdir + 'Concatenated_N50.txt'
os.system('time find %s -maxdepth 1 -type f | xargs cat >> %s' % (n50_dir, concat_n50_result_file))

# remove the directory with N50 stats after concatenating info into a single file.
shutil.rmtree(n50_dir)

# run skani triangle
skani_result_file = outdir + 'Skani_Triangle_Edge_Output.txt'
skani_triangle_cmd = ['skani', 'triangle', '-l', all_genomes_listing_file, # '-s', str(percent_identity_cutoff), <- no longer using this since v1.0.7
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,7 +1,7 @@
[project]
name = "skDER"
authors = [{name="Rauf Salamzade", email="[email protected]"}]
version = "1.0.10"
version = "1.1.0"
description = "Program to select distinct representatives from an input set of microbial genomes."

[build-system]
23 changes: 18 additions & 5 deletions src/skDER/util.py
@@ -2,7 +2,6 @@
import sys
import logging
import traceback
import shutil
import pyfastx
import subprocess
from Bio import SeqIO
@@ -96,15 +95,29 @@ def is_fasta(fasta):
except:
return False

# Yield successive n-sized
# chunks from l.
def divide_chunks(l, n):
# function taken from: https://www.geeksforgeeks.org/break-list-chunks-size-n-python/
# looping till length l
for i in range(0, len(l), n):
yield l[i:i + n]

def compute_n50(inputs):
"""
Uses pyfastx
"""
input_fasta, output_file = inputs
fa = pyfastx.Fasta(input_fasta, build_index=True)
n50, _ = fa.nl(50)
input_fastas, output_file = inputs
output_handle = open(output_file, 'w')
output_handle.write(input_fasta + '\t' + str(n50) + '\n')
for input_fasta in input_fastas:
fa = pyfastx.Fasta(input_fasta, build_index=True)
n50, _ = fa.nl(50)
try:
index_file = input_fasta + '.fxi'
os.remove(index_file)
except:
pass
output_handle.write(input_fasta + '\t' + str(n50) + '\n')
output_handle.close()
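
For readers unfamiliar with the statistic computed here, the sketch below is a minimal pure-Python N50 calculation (independent of pyfastx; the contig lengths are made up). The N50 is the contig length L such that contigs of length >= L account for at least half of the total assembly size, which is why skDER favors the assembly with the larger N50 as the more contiguous one.

```
def n50(contig_lengths):
    # N50: the contig length at which the running sum over contigs,
    # sorted from longest to shortest, first reaches half the assembly size.
    total = sum(contig_lengths)
    running = 0
    for length in sorted(contig_lengths, reverse=True):
        running += length
        if running * 2 >= total:
            return length

print(n50([500, 400, 300, 200, 100]))  # total = 1500; 500 + 400 = 900 >= 750, so N50 = 400
```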

def multiProcess(inputs):
