Skip to content

Commit

Permalink
New format yaml files for the tutorial
Browse files Browse the repository at this point in the history
  • Loading branch information
microbiomix committed Jun 7, 2023
1 parent 8e378af commit 4e3d8b6
Show file tree
Hide file tree
Showing 11 changed files with 474 additions and 97 deletions.
17 changes: 12 additions & 5 deletions tutorial/configuration/data_integration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,38 @@
PROJECT: IBD_tutorial
working_dir: /mypath/IBD_tutorial
omics: metaG_metaT
local_dir: /scratch/johndoe/IBD_tutorial
minto_dir: /server/apps/MIntO
local_dir: /tmp/user/
minto_dir: /mypath/MIntO
METADATA: /mypath/IBD_tutorial/tutorial_metadata.txt

######################
# Program settings
######################
alignment_identity: 95
abundance_normalization: TPM
abundance_normalization: MG
map_reference: MAG
MIN_mapped_reads: 2

MERGE_threads: 4
MERGE_memory: 5

ANNOTATION_file:
MAG_omics: metaG

ANNOTATION_file:

# List of annotation IDs to use for generating function profiles.
# If map_reference is 'MAG' or 'reference_genome', this list corresponds to:
# 'eggNOG_OGs','KEGG_Pathway','KEGG_Module','KEGG_Module','PFAMs','dbCAN.mod' and 'dbCAN.enzclass.
# 'eggNOG_OGs','KEGG_Pathway','KEGG_Module','KEGG_KO','PFAMs','dbCAN.mod' and 'dbCAN.enzclass'.
# The names should match the ANNOTATION_file column names.
# E.g.:
# - eggNOG_OGs
# - KEGG_Pathway
ANNOTATION_ids:
- eggNOG_OGs
- KEGG_Pathway
- KEGG_Module
- KEGG_KO
- PFAMs
- dbCAN.mod
- dbCAN.enzclass

4 changes: 2 additions & 2 deletions tutorial/configuration/metaG/QC_1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
PROJECT: IBD_tutorial
working_dir: /mypath/IBD_tutorial
omics: metaG
local_dir: /scratch/johndoe/IBD_tutorial
minto_dir: /server/apps/MIntO
local_dir: /tmp/user/
minto_dir: /mypath/MIntO
METADATA: /mypath/IBD_tutorial/tutorial_metadata.txt
raw_reads_dir: /mypath/IBD_tutorial_raw/metaG

Expand Down
66 changes: 58 additions & 8 deletions tutorial/configuration/metaG/QC_2.yaml
Original file line number Diff line number Diff line change
@@ -1,39 +1,89 @@
######################
# General settings
######################

PROJECT: IBD_tutorial
working_dir: /mypath/IBD_tutorial
omics: metaG
local_dir: /scratch/johndoe/IBD_tutorial
minto_dir: /server/apps/MIntO
local_dir: /tmp/user/
minto_dir: /mypath/MIntO
METADATA: /mypath/IBD_tutorial/tutorial_metadata.txt

######################
########################################
# Program settings
######################
########################################

#########################
# Read length filtering
TRIMMOMATIC_threads: 5
TRIMMOMATIC_memory: 4
TRIMMOMATIC_minlen: 134
#########################

READ_minlen: 134

#########################
# Host genome filtering
#########################

# bwa-mem2 index files will be stored at: <PATH_host_genome>/BWA_index/<NAME_host_genome>.*
# If it already exists, then it will be used directly.
# If not, a fasta file should exist as: <PATH_host_genome>/<NAME_host_genome>
# This will be built into index files using:
# bwa-mem2 index -p <PATH_host_genome>/BWA_index/<NAME_host_genome> <PATH_host_genome>/<NAME_host_genome>
# Please do not use '.fasta' or '.fa' or '.fna' extension for the fasta file. Name it without extension.

PATH_host_genome: /mypath/IBD_tutorial
NAME_host_genome: build_hg18_subset.fna
BWA_index_host_memory: 40
BWA_host_threads: 8
BWA_host_memory: 40

##################################
# Assembly-free taxonomy profiling
##################################

# Following values for 'taxa_profile' are supported:
# 1. metaphlan - relative abundance using MetaPhlAn
# 2. motus_raw - read counts using mOTUs
# 3. motus_rel - relative abundance using mOTUs
# A comma-delimited combination of multiple options is also supported.
# E.g.:
# taxa_profile: metaphlan,motus_rel
TAXA_threads: 8
TAXA_memory: 10
taxa_profile: motus_rel
taxa_profile: motus_rel

#####################
# Analysis parameters
#####################

# MAIN_factor - the main factor in the metadata file to differentiate in visualization (using color)
# PLOT_factor2 - the second factor in the metadata file to differentiate in visualization (using shape)
# PLOT_time - name of the factor in the metadata file denoting time (e.g. hour, day)

MAIN_factor: participant_ID2
PLOT_factor2:
PLOT_time: week_n

######################
# Optionally, do you want to merge replicates or make pseudo samples?
# E.g:
# MERGE_ILLUMINA_SAMPLES:
# - merged=rep1+rep2+rep3
#
# The above directive will combine 3 samples (rep1, rep2 and rep3)
# after the last step into a new sample called 'merged'. Now you can remove
# rep1, rep2 and rep3 from assembly, MAG generation and profiling steps.
# Please note that METADATA file must have an entry for 'merged' as well,
# otherwise QC_2 step will fail.
# Having extra entries in METADATA file does not affect you in any way.
######################

#MERGE_ILLUMINA_SAMPLES:


######################
# Input data
######################

# ILLUMINA section:
# -----------------
# List of illumina samples that will be filtered by read length.
Expand Down
83 changes: 57 additions & 26 deletions tutorial/configuration/metaG/assembly.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,35 @@
# General settings
######################
PROJECT: IBD_tutorial
working_dir: /mypath/IBD_tutorial_output
working_dir: /mypath/IBD_tutorial
omics: metaG
local_dir: /mypath/tmp/IBD_tutorial_metaG
local_dir: /tmp/user/
minto_dir: /mypath/MIntO
METADATA: /mypath/MIntO/tutorial/metadata/tutorial_metadata.txt
METADATA: /mypath/IBD_tutorial/tutorial_metadata.txt

######################
# Program settings
######################
# MetaSPAdes settings
#
METASPADES_qoffset: auto
METASPADES_threads: 10
METASPADES_threads: 16
METASPADES_memory: 8
METASPADES_hybrid_max_k: 99
METASPADES_illumina_max_k: 99

# MEGAHIT settings
#
# Total used memory is not given here but calculated to be 10G per sample in the coassembly.
# Default parameter is 5 for metaG samples with 10M reads.
# Please make sure that there is enough RAM on the server.
MEGAHIT_threads: 10
MEGAHIT_memory: 1
# Note on MEGAHIT_memory:
# MEGAHIT's memory requirement scales with sample count and sample size.
# Default MEGAHIT_memory below is 10 Gigabytes per sample (plenty for metaG samples of 10M reads).
# MEGAHIT_memory parameter represents per-sample memory in Gigabytes
# during the first attempt. If it is not enough and MEGAHIT fails, then
# each successive attempt will increase per-sample memory by 6 Gigabytes
# until it succeeds or reaches max_attempts from snakemake.
# Please make sure that there is enough RAM on the server.
MEGAHIT_memory: 10
MEGAHIT_threads: 32
MEGAHIT_presets:
- meta-sensitive
- meta-large
Expand All @@ -42,24 +47,49 @@ MEGAHIT_presets:
# want one parameter to be used, please comment everything else.
# 2. If nothing is listed here, then MetaFlye won't be run.
# If you just want our default parameters above, then here is a possible option:
# metaflye-default:
# metaflye-default: ""
# 3. 'tres-o3000-3x' is valid for flye 2.8.3. From 2.9.x, --plasmids and --trestle are
# not valid. So please use valid options if you are using newer versions of flye.
METAFLYE_presets:
tres-o3000-3x: --min-overlap 3000 --iterations 3
#metaflye-default:
#metaflye-default: ""

# BWA settings
# Used when mapping reads back to contigs
#
BWA_threads: 8
BWA_threads: 24

# samtools settings
# Used when sorting bam files
#
SAMTOOLS_sort_threads: 4
SAMTOOLS_sort_memory_gb: 20

###############################
# Binning preparation settings
###############################

# Whether to use contigs or scaffolds from SPAdes
SPADES_CONTIGS_OR_SCAFFOLDS: scaffolds

# minimum contig/scaffold fasta length
MIN_FASTA_LENGTH: 2500

# assembly batch size for mapping reads to combined contig sets
CONTIG_MAPPING_BATCH_SIZE: 100

# Should we exclude any assembly type during MAG generation?
# E.g., if you make MAGs from metaT, individual sample assemblies
# will be quite fragmented. If you have several co-assemblies, then
# ignoring 'illumina_single' might improve your MAG quality. This can
# be achieved using:
# ---
# EXCLUDE_ASSEMBLY_TYPES:
# - illumina_single
# ---
#
# EXCLUDE_ASSEMBLY_TYPES:

# Input data

# HYBRID section:
Expand All @@ -70,7 +100,7 @@ SAMTOOLS_sort_memory_gb: 20
# Hybrid assemblies will be performed for each combination of nanopore and illumina samples.
# E.g.:
#
# N1: I3+I4
# N1: I3+I4
#
# The above will result in 2 hybrid assemblies: 'N1-I3' and 'N1-I4'
#
Expand All @@ -86,19 +116,6 @@ SAMTOOLS_sort_memory_gb: 20
#
#HYBRID:

# COASSEMBLY section:
# -------------------
# MEGAHIT coassembly will be performed using the following definitions.
# Each coassembly is named in the LHS, and corresponding illumina sample(s) are in RHS (delimited by '+').
# One coassembly will be performed for each line.
# E.g. 'Subject1: I3+I4' will result in 1 coassembly: 'Subject1' using I3 and I4 data.
# Memory per coassembly is calculated to be 10G per sample in the coassembly.
# Please make sure that there is enough RAM on the server.
#
COASSEMBLY:
Full: CD136+CD138+CD140+CD142+CD146+CD237+CD238+CD240+CD242+CD244
CD1: CD136+CD138+CD140+CD142+CD146
CD2: CD237+CD238+CD240+CD242+CD244
# NANOPORE section:
# -----------------
# List of nanopore samples that will be assembled individually using MetaFlye.
Expand All @@ -124,3 +141,17 @@ ILLUMINA:
- CD240
- CD242
- CD244

# COASSEMBLY section:
# -------------------
# MEGAHIT coassembly will be performed using the following definitions.
# Each coassembly is named in the LHS, and corresponding illumina sample(s) are in RHS (delimited by '+').
# One coassembly will be performed for each line.
# E.g. 'Subject1: I3+I4' will result in 1 coassembly: 'Subject1' using I3 and I4 data.
# Memory per coassembly is calculated to be 10G per sample in the coassembly.
# Please make sure that there is enough RAM on the server.
#
COASSEMBLY:
Full: CD136+CD138+CD140+CD142+CD146+CD237+CD238+CD240+CD242+CD244
CD1: CD136+CD138+CD140+CD142+CD146
CD2: CD237+CD238+CD240+CD242+CD244
48 changes: 24 additions & 24 deletions tutorial/configuration/metaG/mags_generation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,46 +2,42 @@
# General settings
######################
PROJECT: IBD_tutorial
working_dir: /mypath/IBD_tutorial_output
working_dir: /mypath/IBD_tutorial
omics: metaG
local_dir: /mypath/tmp/IBD_tutorial_metaG
local_dir: /tmp/user/
minto_dir: /mypath/MIntO
METADATA: /mypath/MIntO/tutorial/metadata/tutorial_metadata.txt
METADATA: /mypath/IBD_tutorial/tutorial_metadata.txt

######################
# Program settings
######################
# COMMON PARAMETERS
#
MIN_FASTA_LENGTH: 500000
MIN_FASTA_LENGTH: 2500
MIN_MAG_LENGTH: 500000
BINSPLIT_CHAR: _

# VAMB settings
#
BINNERS:
- vamb_256
- vamb_384
- vamb_512
- vamb_768
- aaey
- aaez
- vae384

VAMB_THREADS: 8
VAMB_memory: 1
VAMB_THREADS: 24
VAMB_memory: 20

# Use GPU in VAMB:
# could be yes or not
# could be yes or no
VAMB_GPU: no


# CHECKM settings
#
CHECKM_THREADS: 8
CHECKM_memory: 5

# checkm threshold
CHECKM_COMPLETENESS: 90 #higher than this
CHECKM_COMPLETENESS: 90 # higher than this
CHECKM_CONTAMINATION: 5 # lower than this

# Clean up checkm files?
CLEAN_CHECKM: yes # can be yes or not
CHECKM_BATCH_SIZE: 50 # Process MAGs with this batch size
CHECKM_DATABASE: /mypath/MIntO/data/CheckM2_database/uniref100.KO.1.dmnd

# COVERM settings
#
Expand All @@ -51,7 +47,7 @@ COVERM_memory: 5
# SCORING THE BEST GENOMES settings
#
# this could be checkm or genome
SCORE_METHOD: checkm
SCORE_METHOD: checkm


# PROKKA settings
Expand All @@ -60,10 +56,14 @@ RUN_PROKKA: yes
PROKKA_CPUS: 8
PROKKA_memory: 5

# PHYLOPHLAN METAGENOMICS settings
# MAG taxonomy settings
#
RUN_TAXONOMY: yes
TAXONOMY_DATABASE: SGB.Jan20 #SGB.Dec20
TAXONOMY_NAME: phylophlan # Currently, only phylophlan
TAXONOMY_CPUS: 8
TAXONOMY_memory: 5
DATABASE_FOLDER: /mypath/MIntO/data
TAXONOMY_memory: 5

# PHYLOPHLAN METAGENOMICS settings
#
TAXONOMY_DATABASE: SGB.Jan20
TAXONOMY_DATABASE_FOLDER: /mypath/MIntO/data
Loading

0 comments on commit 4e3d8b6

Please sign in to comment.