update to v1.2.3

raufs · Jul 12, 2024 · 082e341 · 082e341
1 parent 14bf52a
commit 082e341
Show file tree

Hide file tree

Showing 5 changed files with 50 additions and 43 deletions.
diff --git a/README.md b/README.md
@@ -68,6 +68,8 @@ skDER features two distinct algorithms for dereplication (details can be found b
 
 #### CiDDER
 
+***CiDDER currently works only for bacteria because it uses pyrodigal for gene calling!***
+
 In v1.2.0, we also introduced a second program called CiDDER (CD-hit based DEReplication) - which allows for optimizing selection of a minimal number of genomes that achieve some level of saturation of the pan-genome of the full set of genomes (see below for details). Note, CD-HIT determines protein clusters, not proper ortholog groups, and as such an approximation is made of the pan-genome space being sampled by representative genomes.
 
 ## Details on Dereplication Algorithms
@@ -145,7 +147,7 @@ The help function should return the following:
 
 ```
 usage: skder [-h] [-g GENOMES [GENOMES ...]] [-t TAXA_NAME] [-r GTDB_RELEASE] -o OUTPUT_DIRECTORY [-d DEREPLICATION_MODE] [-i PERCENT_IDENTITY_CUTOFF] [-tc] [-f ALIGNED_FRACTION_CUTOFF]
-             [-a MAX_AF_DISTANCE_CUTOFF] [-p SKANI_TRIANGLE_PARAMETERS] [-c CPUS] [-s] [-n] [-l] [-b] [-u] [-v]
+             [-a MAX_AF_DISTANCE_CUTOFF] [-p SKANI_TRIANGLE_PARAMETERS] [-s] [-n] [-l] [-b] [-u] [-c THREADS] [-v]
 
 	Program: skder
 	Author: Rauf Salamzade
@@ -169,46 +171,47 @@ options:
                         Genome assembly files in (gzipped) FASTA format
                         (accepted suffices are: *.fasta,
                         *.fa, *.fas, or *.fna) [Optional].
-  -t TAXA_NAME, --taxa_name TAXA_NAME
+  -t TAXA_NAME, --taxa-name TAXA_NAME
                         Genus or species identifier from GTDB for which to
                         download genomes for and include in
                         dereplication analysis [Optional].
-  -r GTDB_RELEASE, --gtdb_release GTDB_RELEASE
+  -r GTDB_RELEASE, --gtdb-release GTDB_RELEASE
                         Which GTDB release to use if -t argument issued [Default is R220].
-  -o OUTPUT_DIRECTORY, --output_directory OUTPUT_DIRECTORY
+  -o OUTPUT_DIRECTORY, --output-directory OUTPUT_DIRECTORY
                         Output directory.
-  -d DEREPLICATION_MODE, --dereplication_mode DEREPLICATION_MODE
+  -d DEREPLICATION_MODE, --dereplication-mode DEREPLICATION_MODE
                         Whether to use a "dynamic" (more concise) or "greedy" (more
                         comprehensive) approach to selecting representative genomes.
                         [Default is "dynamic"]
-  -i PERCENT_IDENTITY_CUTOFF, --percent_identity_cutoff PERCENT_IDENTITY_CUTOFF
+  -i PERCENT_IDENTITY_CUTOFF, --percent-identity-cutoff PERCENT_IDENTITY_CUTOFF
                         ANI cutoff for dereplication [Default is 99.0].
-  -tc, --test_cutoffs   Assess clustering using various pre-selected cutoffs.
-  -f ALIGNED_FRACTION_CUTOFF, --aligned_fraction_cutoff ALIGNED_FRACTION_CUTOFF
+  -tc, --test-cutoffs   Assess clustering using various pre-selected cutoffs.
+  -f ALIGNED_FRACTION_CUTOFF, --aligned-fraction-cutoff ALIGNED_FRACTION_CUTOFF
                         Aligned cutoff threshold for dereplication - only needed by
                         one genome [Default is 90.0].
-  -a MAX_AF_DISTANCE_CUTOFF, --max_af_distance_cutoff MAX_AF_DISTANCE_CUTOFF
+  -a MAX_AF_DISTANCE_CUTOFF, --max-af-distance-cutoff MAX_AF_DISTANCE_CUTOFF
                         Maximum difference for aligned fraction between a pair to
                         automatically disqualify the genome with a higher
                         AF from being a representative.
-  -p SKANI_TRIANGLE_PARAMETERS, --skani_triangle_parameters SKANI_TRIANGLE_PARAMETERS
+  -p SKANI_TRIANGLE_PARAMETERS, --skani-triangle-parameters SKANI_TRIANGLE_PARAMETERS
                         Options for skani triangle. Note ANI and AF cutoffs
                         are specified separately and the -E parameter is always
                         requested. [Default is ""].
-  -c CPUS, --cpus CPUS  Number of CPUs to use.
-  -s, --sanity_check    Confirm each FASTA file provided or downloaded is actually
+  -s, --sanity-check    Confirm each FASTA file provided or downloaded is actually
                         a FASTA file. Makes it slower, but generally
                         good practice.
-  -n, --determine_clusters
+  -n, --determine-clusters
                         Perform secondary clustering to assign non-representative
                         genomes to their closest representative genomes.
   -l, --symlink         Symlink representative genomes in results subdirectory
                         instead of performing a copy of the files.
-  -b, --index_locally   Build indices locally instead of in the directory of input genomes.
-  -u, --ncbi_nlm_url    Try using the NCBI ftp address with '.nlm' for
+  -b, --index-locally   Build indices locally instead of in the directory of input genomes.
+  -u, --ncbi-nlm-url    Try using the NCBI ftp address with '.nlm' for
                         ncbi-genome-download if there are issues.
+  -c THREADS, --threads THREADS
+                        Number of threads/processes to use [Default is 1].
   -v, --version         Report version of skDER.
-  ```
+```
 
 ### Usage for CiDDER
 
@@ -220,11 +223,8 @@ cidder -h
 The help function should return the following:
 
 ```
-usage: cidder [-h] [-g GENOMES [GENOMES ...]] [-t TAXA_NAME] [-r GTDB_RELEASE]
-              -o OUTPUT_DIRECTORY [-p CD_HIT_PARAMS] [-mg] [-e]
-              [-n NEW_PROTEINS_NEEDED] [-ts TOTAL_SATURATION]
-              [-mgs MULTI_GENOME_SATURATION] [-s] [-l] [-c CPUS] [-m MEMORY]
-              [-u] [-v]
+usage: cidder [-h] [-g GENOMES [GENOMES ...]] [-t TAXA_NAME] [-r GTDB_RELEASE] -o OUTPUT_DIRECTORY [-p CD_HIT_PARAMS] [-mg] [-e] [-a NEW_PROTEINS_NEEDED] [-ts TOTAL_SATURATION]
+              [-mgs MULTI_GENOME_SATURATION] [-s] [-n] [-ns] [-l] [-u] [-c THREADS] [-m MEMORY] [-v]
 
 	Program: cidder
 	Author: Rauf Salamzade
@@ -267,28 +267,35 @@ options:
                         (don't set threads or memory - those are done by default in cidder) and surround by quotes
                         [Default is: "-n 5 -c 0.95 -aL 0.75 -aS 0.90"]
   -mg, --metagenome-mode
-                        Run pyrodigal using metagenome mode [Default is False].
+                        Run pyrodigal using metagenome mode.
   -e, --include-edge-orfs
-                        Include proteins from ORFs that hang off the edge of a contig/scaffold
-                        [Default is False].
-  -n NEW_PROTEINS_NEEDED, --new-proteins-needed NEW_PROTEINS_NEEDED
+                        Include proteins from ORFs that hang off the edge of a contig/scaffold.
+  -a NEW_PROTEINS_NEEDED, --new-proteins-needed NEW_PROTEINS_NEEDED
                         The number of new protein clusters needed to add [Default is 0].
   -ts TOTAL_SATURATION, --total-saturation TOTAL_SATURATION
                         The percentage of total proteins clusters needed to stop representative
                         genome selection [Default is 90.0].
   -mgs MULTI_GENOME_SATURATION, --multi-genome-saturation MULTI_GENOME_SATURATION
                         The percentage of total multi-genome protein clusters needed to stop
                         representative genome selection [Default is 100.0].
-  -s, --sanity_check    Confirm each FASTA file provided or downloaded is actually
+  -s, --sanity-check    Confirm each FASTA file provided or downloaded is actually
                         a FASTA file. Makes it slower, but generally
                         good practice.
+  -n, --determine-clusters
+                        Perform secondary clustering to assign non-representative
+                        genomes to their closest representative genomes based on shared protein clusters.
+  -ns, --determine-clusters-skani
+                        Perform secondary clustering to assign non-representative
+                        genomes to their closest representative genomes based on skani-computed ANI.
   -l, --symlink         Symlink representative genomes in results subdirectory
                         instead of performing a copy of the files.
-  -c CPUS, --cpus CPUS  Number of CPUs to use [Default is 1].
+  -u, --ncbi-nlm-url    Try using the NCBI ftp address with '.nlm' for
+                        ncbi-genome-download if there are issues.
+  -c THREADS, --threads THREADS
+                        Number of threads/processes to use [Default is 1].
   -m MEMORY, --memory MEMORY
                         The memory limit for CD-HIT in Gigabytes [Default is 0 = unlimited].
-  -u, --ncbi_nlm_url    Try using the NCBI ftp address with '.nlm' for
-                        ncbi-genome-download if there are issues.
+  -v, --version         Report version of CiDDER.
 ```
 
 ## Citation notice
@@ -314,7 +321,7 @@ If you use CiDDER, please also consider citing pyrodigal (for gene-calling) and
 
 ## Acknowledgments
 
-We thank Titus Brown, Tessa Pierce-Ward, and Karthik Anantharaman for helpful discussions on the development of skDER/CiDDER - in particular the idea to directly asses the pan-genome space sampled by representative genomes. 
+We thank Titus Brown, Tessa Pierce-Ward, and Karthik Anantharaman for helpful discussions on the development of skDER/CiDDER - in particular the idea to directly assess the pan-genome space sampled by representative genomes. We also thank users on GitHub issues for suggesting ideas for new features.
 
 ## LICENSE
 

diff --git a/bin/cidder b/bin/cidder
@@ -96,9 +96,9 @@ def create_parser():
 	parser.add_argument('-n', '--determine-clusters', action='store_true', help="Perform secondary clustering to assign non-representative\ngenomes to their closest representative genomes based on shared protein clusters.", required=False, default=False)
 	parser.add_argument('-ns', '--determine-clusters-skani', action='store_true', help="Perform secondary clustering to assign non-representative\ngenomes to their closest representative genomes based on skani-computed ANI.", required=False, default=False)
 	parser.add_argument('-l', '--symlink', action='store_true', help="Symlink representative genomes in results subdirectory\ninstead of performing a copy of the files.", required=False, default=False)
-	parser.add_argument('-c', '--cpus', type=int, help="Number of CPUs to use [Default is 1].", required=False, default=1)
-	parser.add_argument('-m', '--memory', type=float, help="The memory limit for CD-HIT in Gigabytes [Default is 0 = unlimited].", required=False, default=0)
 	parser.add_argument('-u', '--ncbi-nlm-url', action='store_true', help="Try using the NCBI ftp address with '.nlm' for\nncbi-genome-download if there are issues.", required=False, default=False)
+	parser.add_argument('-c', '--threads', type=int, help="Number of threads/processes to use [Default is 1].", required=False, default=1)
+	parser.add_argument('-m', '--memory', type=float, help="The memory limit for CD-HIT in Gigabytes [Default is 0 = unlimited].", required=False, default=0)
 	parser.add_argument('-v', '--version', action='store_true', help="Report version of CiDDER.", required=False, default=False)
 	args = parser.parse_args()
 	return args
@@ -125,7 +125,7 @@ def cidder_main():
 	multigenome_saturation_cutoff = myargs.multi_genome_saturation
 	sanity_check = myargs.sanity_check
 	symlink = myargs.symlink
-	cpus = myargs.cpus
+	threads = myargs.threads
 	memory = myargs.memory
 	ncbi_nlm_url_flag = myargs.ncbi_nlm_url
 	determine_clusters_flag = myargs.determine_clusters
@@ -297,7 +297,7 @@ def cidder_main():
 		sys.exit(1)
 
 	try:
-		p = multiprocessing.Pool(cpus)
+		p = multiprocessing.Pool(threads)
 		p.map(util.multiProcess, pyrodigal_cmds)
 		p.close()
 	except Exception as e:
@@ -328,7 +328,7 @@ def cidder_main():
 	cdhit_mem = 1000.0*memory
 	if cdhit_mem > 0 and cdhit_mem < 1:
 		cdhit_mem = 1
-	cdhit_cmd = ['cd-hit', '-d', '0', '-T', str(cpus), '-M', str(cdhit_mem), cd_hit_params, '-i', combined_proteome_faa, '-o', cdhit_result_prefix]
+	cdhit_cmd = ['cd-hit', '-d', '0', '-T', str(threads), '-M', str(cdhit_mem), cd_hit_params, '-i', combined_proteome_faa, '-o', cdhit_result_prefix]
 	try:
 		util.runCmd(cdhit_cmd, logObject, check_files=[cdhit_cluster_file])
 	except:
@@ -484,7 +484,7 @@ def cidder_main():
 		# run skani triangle
 		skani_result_file = outdir + 'Skani_Triangle_Edge_Output.txt'
 
-		skani_triangle_cmd = ['skani', 'triangle', '-l', all_genomes_listing_file, '--min-af', '80.0', '-E', '-t', str(cpus), '-o', skani_result_file]
+		skani_triangle_cmd = ['skani', 'triangle', '-l', all_genomes_listing_file, '--min-af', '80.0', '-E', '-t', str(threads), '-o', skani_result_file]
 		util.runCmd(skani_triangle_cmd, logObject, check_files=[skani_result_file])
 
 		cidder_cluster_result_file = outdir + 'CiDDER_skani_Clustering.txt'

diff --git a/bin/skder b/bin/skder
@@ -89,12 +89,12 @@ def create_parser():
 	parser.add_argument('-f', '--aligned-fraction-cutoff', type=float, help="Aligned cutoff threshold for dereplication - only needed by\none genome [Default is 90.0].", required=False, default=90.0)
 	parser.add_argument('-a', '--max-af-distance-cutoff', type=float, help="Maximum difference for aligned fraction between a pair to\nautomatically disqualify the genome with a higher\nAF from being a representative.", required=False, default=10.0)
 	parser.add_argument('-p', '--skani-triangle-parameters', help="Options for skani triangle. Note ANI and AF cutoffs\nare specified separately and the -E parameter is always\nrequested. [Default is \"\"].", default="", required=False)
-	parser.add_argument('-c', '--cpus', type=int, help="Number of CPUs to use.", required=False, default=1)
 	parser.add_argument('-s', '--sanity-check', action='store_true', help="Confirm each FASTA file provided or downloaded is actually\na FASTA file. Makes it slower, but generally\ngood practice.", required=False, default=False)
 	parser.add_argument('-n', '--determine-clusters', action='store_true', help="Perform secondary clustering to assign non-representative\ngenomes to their closest representative genomes.", required=False, default=False)
 	parser.add_argument('-l', '--symlink', action='store_true', help="Symlink representative genomes in results subdirectory\ninstead of performing a copy of the files.", required=False, default=False)
 	parser.add_argument('-b', '--index-locally', action='store_true', help="Build indices locally instead of in the directory of input genomes.", required=False, default=False)
 	parser.add_argument('-u', '--ncbi-nlm-url', action='store_true', help="Try using the NCBI ftp address with '.nlm' for\nncbi-genome-download if there are issues.", required=False, default=False)
+	parser.add_argument('-c', '--threads', type=int, help="Number of threads/processes to use [Default is 1].", required=False, default=1)
 	parser.add_argument('-v', '--version', action='store_true', help="Report version of skDER.", required=False, default=False)
 	args = parser.parse_args()
 	return args
@@ -119,7 +119,7 @@ def skder_main():
 	skani_triangle_parameters = myargs.skani_triangle_parameters
 	max_af_distance_cutoff = myargs.max_af_distance_cutoff
 	test_cutoffs_flag = myargs.test_cutoffs
-	cpus = myargs.cpus
+	threads = myargs.threads
 	symlink_flag = myargs.symlink
 	determine_clusters_flag = myargs.determine_clusters
 	sanity_check = myargs.sanity_check
@@ -290,7 +290,7 @@ def skder_main():
 			genomes.append(genome_path)
 
 	genome_count = len(genomes)
-	chunk_size = math.ceil(genome_count/cpus)
+	chunk_size = math.ceil(genome_count/threads)
 	genome_chunks = util.divide_chunks(genomes, chunk_size)
 
 	n50_dir = outdir + 'Assembly_N50s/'
@@ -300,7 +300,7 @@ def skder_main():
 		n50_chunk_file = n50_dir + 'chunk_' + str(i) + '.txt'
 		n50_inputs.append([gc, n50_chunk_file, index_locally_flag])
 
-	p = multiprocessing.Pool(cpus)
+	p = multiprocessing.Pool(threads)
 	p.map(util.compute_n50, n50_inputs)
 	p.close()
 
@@ -314,13 +314,13 @@ def skder_main():
 	# run skani triangle
 	skani_result_file = outdir + 'Skani_Triangle_Edge_Output.txt'
 	skani_triangle_cmd = ['skani', 'triangle', '-l', all_genomes_listing_file, 
-			      '--min-af', str(aligned_fraction_cutoff), '-E', skani_triangle_parameters, '-t', str(cpus), '-o', skani_result_file]
+			      '--min-af', str(aligned_fraction_cutoff), '-E', skani_triangle_parameters, '-t', str(threads), '-o', skani_result_file]
 	if test_cutoffs_flag:
 		min_af_cutoff = min(PRESELECTED_AF_CUTOFFS)
 		if selection_mode == 'dynamic':
 			min_af_cutoff = max([min_af_cutoff - 20.0, 0.0])
 		skani_triangle_cmd = ['skani', 'triangle', '-l', all_genomes_listing_file,
-			      '--min-af', str(min_af_cutoff), '-E', skani_triangle_parameters, '-t', str(cpus), '-o', skani_result_file]
+			      '--min-af', str(min_af_cutoff), '-E', skani_triangle_parameters, '-t', str(threads), '-o', skani_result_file]
 	util.runCmd(skani_triangle_cmd, logObject, check_files=[skani_result_file])
 
 	if test_cutoffs_flag:

diff --git a/run_tests.sh b/run_tests.sh
@@ -7,4 +7,4 @@ cd test_case/
 
 # run skder on test set of Cutibacterium granulosum genomes present in GTDB R214.
 skder -g Cutibacterium_granulosum_Genomes_in_GTDB_R214/*.fna -o skder_results/ -c 4 -n
-cidder -g Cutibacterium_granulosum_Genomes_in_GTDB_R214/*.fna -o cidder_results/ -c 4
+cidder -g Cutibacterium_granulosum_Genomes_in_GTDB_R214/*.fna -o cidder_results/ -c 4 -n -ns
diff --git a/test_case.tar.gz b/test_case.tar.gz