New format yaml files for the tutorial

arumugamlab · Jun 7, 2023 · 4e3d8b6 · 4e3d8b6
1 parent 8e378af
commit 4e3d8b6
Show file tree

Hide file tree

Showing 11 changed files with 474 additions and 97 deletions.
diff --git a/tutorial/configuration/data_integration.yaml b/tutorial/configuration/data_integration.yaml
@@ -4,31 +4,38 @@
 PROJECT: IBD_tutorial
 working_dir: /mypath/IBD_tutorial
 omics: metaG_metaT
-local_dir: /scratch/johndoe/IBD_tutorial
-minto_dir: /server/apps/MIntO
+local_dir: /tmp/user/
+minto_dir: /mypath/MIntO
 METADATA: /mypath/IBD_tutorial/tutorial_metadata.txt
 
 ######################
 # Program settings
 ######################
 alignment_identity: 95
-abundance_normalization: TPM
+abundance_normalization: MG
 map_reference: MAG
+MIN_mapped_reads: 2
 
 MERGE_threads: 4
 MERGE_memory: 5
 
-ANNOTATION_file:
+MAG_omics: metaG
 
+ANNOTATION_file:
 
 # List annotation IDs matching to generate function profiles. 
 # If map_reference= 'MAG' or 'reference_genome', this list corresponds to:
-# 'eggNOG_OGs','KEGG_Pathway','KEGG_Module','KEGG_Module','PFAMs','dbCAN.mod' and 'dbCAN.enzclass. 
+# 'eggNOG_OGs','KEGG_Pathway','KEGG_Module','KEGG_KO','PFAMs','dbCAN.mod' and 'dbCAN.enzclass'.
 # The names should match the ANNOTATION_file column names.
 #   E.g.:
 # - eggNOG_OGs
 # - KEGG_Pathway
 ANNOTATION_ids:
+ - eggNOG_OGs
+ - KEGG_Pathway
+ - KEGG_Module
+ - KEGG_KO
+ - PFAMs
  - dbCAN.mod
  - dbCAN.enzclass
 
diff --git a/tutorial/configuration/metaG/QC_1.yaml b/tutorial/configuration/metaG/QC_1.yaml
@@ -5,8 +5,8 @@
 PROJECT: IBD_tutorial
 working_dir: /mypath/IBD_tutorial
 omics: metaG
-local_dir: /scratch/johndoe/IBD_tutorial
-minto_dir: /server/apps/MIntO
+local_dir: /tmp/user/
+minto_dir: /mypath/MIntO
 METADATA: /mypath/IBD_tutorial/tutorial_metadata.txt
 raw_reads_dir: /mypath/IBD_tutorial_raw/metaG
 

diff --git a/tutorial/configuration/metaG/QC_2.yaml b/tutorial/configuration/metaG/QC_2.yaml
@@ -1,39 +1,89 @@
 ######################
 # General settings
 ######################
+
 PROJECT: IBD_tutorial
 working_dir: /mypath/IBD_tutorial
 omics: metaG
-local_dir: /scratch/johndoe/IBD_tutorial
-minto_dir: /server/apps/MIntO
+local_dir: /tmp/user/
+minto_dir: /mypath/MIntO
 METADATA: /mypath/IBD_tutorial/tutorial_metadata.txt
 
-######################
+########################################
 # Program settings
-######################
+########################################
+
+#########################
 # Read length filtering
-TRIMMOMATIC_threads: 5
-TRIMMOMATIC_memory: 4
-TRIMMOMATIC_minlen: 134
+#########################
+
+READ_minlen: 134
 
+#########################
 # Host genome filtering
+#########################
+
+# bwa-mem2 index files will be stored at: <PATH_host_genome>/BWA_index/<NAME_host_genome>.*
+# If it already exists, then it will be used directly.
+# If not, a fasta file should exist as: <PATH_host_genome>/<NAME_host_genome>
+# This will be built into index files using:
+#    bwa-mem2 index -p <PATH_host_genome>/BWA_index/<NAME_host_genome> <PATH_host_genome>/<NAME_host_genome>
+# Please do not use '.fasta' or '.fa' or '.fna' extension for the fasta file. Name it without extension.
+
 PATH_host_genome: /mypath/IBD_tutorial
 NAME_host_genome: build_hg18_subset.fna
 BWA_index_host_memory: 40
 BWA_host_threads: 8
 BWA_host_memory: 40
 
+##################################
 # Assembly-free taxonomy profiling
+##################################
+
+# Following values for 'taxa_profile' are supported: 
+#    1. metaphlan - relative abundance using MetaPhlAn
+#    2. motus_raw - read counts using mOTUs
+#    3. motus_rel - relative abundance using mOTUs
+# A comma-delimited combination of multiple options is also supported.
+# E.g.:
+#    taxa_profile: metaphlan,motus_rel
 TAXA_threads: 8
 TAXA_memory: 10
-taxa_profile: motus_rel
+taxa_profile: motus_rel 
 
+#####################
 # Analysis parameters
+#####################
+
+# MAIN_factor  - the main factor in the metadata file to differentiate in visualization (using color)
+# PLOT_factor2 - the second factor in the metadata file to differentiate in visualization (using shape)
+# PLOT_time    - name of the factor in the metadata file denoting time (e.g. hour, day)
+
 MAIN_factor: participant_ID2
 PLOT_factor2:
 PLOT_time: week_n
 
+######################
+# Optionally, you can merge replicates or make pseudo samples.
+# E.g.:
+# MERGE_ILLUMINA_SAMPLES:
+#  - merged=rep1+rep2+rep3
+#
+# The above directive will combine 3 samples (rep1, rep2 and rep3)
+# after the last step into a new sample called 'merged'. Now you can remove
+# rep1, rep2 and rep3 from assembly, MAG generation and profiling steps.
+# Please note that METADATA file must have an entry for 'merged' as well,
+# otherwise QC_2 step will fail.
+# Having extra entries in METADATA file does not affect you in any way.
+######################
+
+#MERGE_ILLUMINA_SAMPLES:
+
+
+######################
 # Input data
+######################
+
 # ILLUMINA section:
 # -----------------
 # List of illumina samples that will be filtered by read length.

diff --git a/tutorial/configuration/metaG/assembly.yaml b/tutorial/configuration/metaG/assembly.yaml
@@ -2,30 +2,35 @@
 # General settings
 ######################
 PROJECT: IBD_tutorial
-working_dir: /mypath/IBD_tutorial_output
+working_dir: /mypath/IBD_tutorial
 omics: metaG
-local_dir: /mypath/tmp/IBD_tutorial_metaG
+local_dir: /tmp/user/
 minto_dir: /mypath/MIntO
-METADATA: /mypath/MIntO/tutorial/metadata/tutorial_metadata.txt
+METADATA: /mypath/IBD_tutorial/tutorial_metadata.txt
 
 ######################
 # Program settings
 ######################
 # MetaSPAdes settings
 #
 METASPADES_qoffset: auto
-METASPADES_threads: 10
+METASPADES_threads: 16
 METASPADES_memory: 8
 METASPADES_hybrid_max_k: 99
 METASPADES_illumina_max_k: 99
 
 # MEGAHIT settings
 #
-# Total used memory is not given here but calculated to be 10G per sample in the coassembly.
-# Default parameter is 5 for metaG samples with 10M reads.
-# Please make sure that there is enough RAM on the server.
-MEGAHIT_threads: 10
-MEGAHIT_memory: 1
+# Note on MEGAHIT_memory:
+#     MEGAHIT's memory requirement scales with sample count and sample size.
+#     Default MEGAHIT_memory below is 10 Gigabytes per sample (plenty for metaG samples of 10M reads).
+#     MEGAHIT_memory parameter represents per-sample memory in Gigabytes
+#     during the first attempt. If it is not enough and MEGAHIT fails, then
+#     each successive attempt will increase per-sample memory by 6 Gigabytes
+#     until it succeeds or reaches max_attempts from snakemake.
+#     Please make sure that there is enough RAM on the server.
+MEGAHIT_memory: 10
+MEGAHIT_threads: 32
 MEGAHIT_presets:
  - meta-sensitive
  - meta-large
@@ -42,24 +47,49 @@ MEGAHIT_presets:
 #    want one parameter to be used, please comment everything else.
 # 2. If nothing is listed here, then MetaFlye won't be run.
 #    If you just want our default parameters above, then here is a possible option:
-#      metaflye-default: 
+#      metaflye-default: ""
 # 3. 'tres-o3000-3x' is valid for flye 2.8.3. From 2.9.x, --plasmids and --trestle are
 #    not valid. So please use valid options if you are using newer versions of flye.
 METAFLYE_presets:
   tres-o3000-3x: --min-overlap 3000 --iterations 3
-  #metaflye-default: 
+  #metaflye-default: ""
 
 # BWA settings
 # Used when mapping reads back to contigs
 #
-BWA_threads: 8
+BWA_threads: 24
 
 # samtools settings
 # Used when sorting bam files
 #
 SAMTOOLS_sort_threads: 4
 SAMTOOLS_sort_memory_gb: 20
 
+###############################
+# Binning preparation settings
+###############################
+
+# Whether to use contigs or scaffolds from SPAdes
+SPADES_CONTIGS_OR_SCAFFOLDS: scaffolds
+
+# minimum contig/scaffold fasta length
+MIN_FASTA_LENGTH: 2500
+
+# assembly batch size for mapping reads to combined contig sets
+CONTIG_MAPPING_BATCH_SIZE: 100
+
+# Should we exclude any assembly type during MAG generation?
+# E.g., if you make MAGs from metaT, individual sample assemblies
+# will be quite fragmented. If you have several co-assemblies, then
+# ignoring 'illumina_single' might improve your MAG quality. This can
+# be achieved using:
+# ---
+# EXCLUDE_ASSEMBLY_TYPES:
+#     - illumina_single
+# ---
+#
+# EXCLUDE_ASSEMBLY_TYPES:
+
 # Input data
 
 # HYBRID section:
@@ -70,7 +100,7 @@ SAMTOOLS_sort_memory_gb: 20
 #   Hybrid assemblies will be performed for each combination of nanopore and illumina samples.
 #   E.g.:
 #
-#   N1: I3+I4 
+#   N1: I3+I4
 #
 #   The above will result in 2 hybrid assemblies: 'N1-I3' and 'N1-I4'
 #
@@ -86,19 +116,6 @@ SAMTOOLS_sort_memory_gb: 20
 #
 #HYBRID:
 
-# COASSEMBLY section:
-# -------------------
-# MEGAHIT coassembly will be performed using the following definitions.
-# Each coassembly is named in the LHS, and corresponding illumina sample(s) are in RHS (delimited by '+').
-# One coassembly will be performed for each line. 
-# E.g. 'Subject1: I3+I4' will result in 1 coassembly: 'Subject1' using I3 and I4 data.
-# Memory per coassembly is calculated to be 10G per sample in the coassembly.
-# Please make sure that there is enough RAM on the server.
-#
-COASSEMBLY:
-  Full: CD136+CD138+CD140+CD142+CD146+CD237+CD238+CD240+CD242+CD244
-  CD1: CD136+CD138+CD140+CD142+CD146
-  CD2: CD237+CD238+CD240+CD242+CD244
 # NANOPORE section:
 # -----------------
 # List of nanopore samples that will be assembled individually using MetaFlye.
@@ -124,3 +141,17 @@ ILLUMINA:
 - CD240
 - CD242
 - CD244
+
+# COASSEMBLY section:
+# -------------------
+# MEGAHIT coassembly will be performed using the following definitions.
+# Each coassembly is named in the LHS, and corresponding illumina sample(s) are in RHS (delimited by '+').
+# One coassembly will be performed for each line.
+# E.g. 'Subject1: I3+I4' will result in 1 coassembly: 'Subject1' using I3 and I4 data.
+# Memory per coassembly is calculated to be 10G per sample in the coassembly.
+# Please make sure that there is enough RAM on the server.
+#
+COASSEMBLY:
+  Full: CD136+CD138+CD140+CD142+CD146+CD237+CD238+CD240+CD242+CD244
+  CD1: CD136+CD138+CD140+CD142+CD146
+  CD2: CD237+CD238+CD240+CD242+CD244
diff --git a/tutorial/configuration/metaG/mags_generation.yaml b/tutorial/configuration/metaG/mags_generation.yaml
@@ -2,46 +2,42 @@
 # General settings
 ######################
 PROJECT: IBD_tutorial
-working_dir: /mypath/IBD_tutorial_output
+working_dir: /mypath/IBD_tutorial
 omics: metaG
-local_dir: /mypath/tmp/IBD_tutorial_metaG
+local_dir: /tmp/user/
 minto_dir: /mypath/MIntO
-METADATA: /mypath/MIntO/tutorial/metadata/tutorial_metadata.txt
+METADATA: /mypath/IBD_tutorial/tutorial_metadata.txt
 
 ######################
 # Program settings
 ######################
 # COMMON PARAMETERS
 #
-MIN_FASTA_LENGTH: 500000
+MIN_FASTA_LENGTH: 2500
+MIN_MAG_LENGTH: 500000
+BINSPLIT_CHAR: _
 
 # VAMB settings
 #
 BINNERS:
-- vamb_256
-- vamb_384
-- vamb_512
-- vamb_768
+- aaey
+- aaez
+- vae384
 
-VAMB_THREADS: 8
-VAMB_memory: 1
+VAMB_THREADS: 24
+VAMB_memory: 20
 
 # Use GPU in VAMB:
-# could be yes or not
+# could be yes or no
 VAMB_GPU: no
 
 
 # CHECKM settings
 #
-CHECKM_THREADS: 8
-CHECKM_memory: 5
-
-# checkm threshold
-CHECKM_COMPLETENESS: 90  #higher than this
+CHECKM_COMPLETENESS: 90  # higher than this
 CHECKM_CONTAMINATION: 5  # lower than this
-
-# Clean up checkm files?
-CLEAN_CHECKM: yes # can be yes or not
+CHECKM_BATCH_SIZE: 50    # Process MAGs with this batch size
+CHECKM_DATABASE: /mypath/MIntO/data/CheckM2_database/uniref100.KO.1.dmnd
 
 # COVERM settings
 #
@@ -51,7 +47,7 @@ COVERM_memory: 5
 # SCORING THE BEST GENOMES settings
 #
 # this could be checkm or genome
-SCORE_METHOD: checkm 
+SCORE_METHOD: checkm
 
 
 # PROKKA settings
@@ -60,10 +56,14 @@ RUN_PROKKA: yes
 PROKKA_CPUS: 8
 PROKKA_memory: 5
 
-# PHYLOPHLAN METAGENOMICS settings
+# MAG taxonomy settings
 #
 RUN_TAXONOMY: yes
-TAXONOMY_DATABASE: SGB.Jan20 #SGB.Dec20
+TAXONOMY_NAME: phylophlan    # Currently, only phylophlan
 TAXONOMY_CPUS: 8
-TAXONOMY_memory: 5 
-DATABASE_FOLDER: /mypath/MIntO/data
+TAXONOMY_memory: 5
+
+# PHYLOPHLAN METAGENOMICS settings
+#
+TAXONOMY_DATABASE: SGB.Jan20
+TAXONOMY_DATABASE_FOLDER: /mypath/MIntO/data