diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index 081176b48..b2ff9711c 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -27,34 +27,34 @@ task assemble { assembly.py --version | tee VERSION - if [[ "${assembler}" == "spades" ]]; then + if [[ "~{assembler}" == "spades" ]]; then assembly.py assemble_spades \ - ${reads_unmapped_bam} \ - ${trim_clip_db} \ - ${sample_name}.assembly1-${assembler}.fasta \ - ${'--nReads=' + spades_n_reads} \ - ${true="--alwaysSucceed" false="" always_succeed} \ - ${'--minContigLen=' + spades_min_contig_len} \ + ~{reads_unmapped_bam} \ + ~{trim_clip_db} \ + ~{sample_name}.assembly1-~{assembler}.fasta \ + ~{'--nReads=' + spades_n_reads} \ + ~{true="--alwaysSucceed" false="" always_succeed} \ + ~{'--minContigLen=' + spades_min_contig_len} \ --memLimitGb $mem_in_gb \ - --outReads=${sample_name}.subsamp.bam \ + --outReads=~{sample_name}.subsamp.bam \ --loglevel=DEBUG else - echo "unrecognized assembler ${assembler}" >&2 + echo "unrecognized assembler ~{assembler}" >&2 exit 1 fi - samtools view -c ${sample_name}.subsamp.bam | tee subsample_read_count >&2 + samtools view -c ~{sample_name}.subsamp.bam | tee subsample_read_count >&2 } output { - File contigs_fasta = "${sample_name}.assembly1-${assembler}.fasta" - File subsampBam = "${sample_name}.subsamp.bam" + File contigs_fasta = "~{sample_name}.assembly1-~{assembler}.fasta" + File subsampBam = "~{sample_name}.subsamp.bam" Int subsample_read_count = read_int("subsample_read_count") String viralngs_version = read_string("VERSION") } runtime { - docker: "${docker}" + docker: docker memory: select_first([machine_mem_gb, 15]) + " GB" cpu: 4 disks: "local-disk 375 LOCAL" @@ -96,53 +96,53 @@ task scaffold { assembly.py --version | tee VERSION assembly.py order_and_orient \ - ${contigs_fasta} \ - ${sep=' ' reference_genome_fasta} \ - ${sample_name}.intermediate_scaffold.fasta \ - ${'--maxgap=' + nucmer_max_gap} \ - ${'--minmatch=' + nucmer_min_match} \ - ${'--mincluster=' + nucmer_min_cluster} \ - ${'--min_pct_contig_aligned=' + scaffold_min_pct_contig_aligned} \ - --outReference ${sample_name}.scaffolding_chosen_ref.fasta \ - --outStats ${sample_name}.scaffolding_stats.txt \ - --outAlternateContigs ${sample_name}.scaffolding_alt_contigs.fasta \ + ~{contigs_fasta} \ + ~{sep=' ' reference_genome_fasta} \ + ~{sample_name}.intermediate_scaffold.fasta \ + ~{'--maxgap=' + nucmer_max_gap} \ + ~{'--minmatch=' + nucmer_min_match} \ + ~{'--mincluster=' + nucmer_min_cluster} \ + ~{'--min_pct_contig_aligned=' + scaffold_min_pct_contig_aligned} \ + --outReference ~{sample_name}.scaffolding_chosen_ref.fasta \ + --outStats ~{sample_name}.scaffolding_stats.txt \ + --outAlternateContigs ~{sample_name}.scaffolding_alt_contigs.fasta \ --loglevel=DEBUG - grep '^>' ${sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | tr '\n' '\t' > ${sample_name}.scaffolding_chosen_ref.txt + grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | tr '\n' '\t' > ~{sample_name}.scaffolding_chosen_ref.txt assembly.py gapfill_gap2seq \ - ${sample_name}.intermediate_scaffold.fasta \ - ${reads_bam} \ - ${sample_name}.intermediate_gapfill.fasta \ + ~{sample_name}.intermediate_scaffold.fasta \ + ~{reads_bam} \ + ~{sample_name}.intermediate_gapfill.fasta \ --memLimitGb $mem_in_gb \ --maskErrors \ --loglevel=DEBUG - grep -v '^>' ${sample_name}.intermediate_gapfill.fasta | tr -d '\n' | wc -c | tee assembly_preimpute_length - grep -v '^>' ${sample_name}.intermediate_gapfill.fasta | tr -d '\nNn' | wc -c | tee assembly_preimpute_length_unambiguous + grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\n' | wc -c | tee assembly_preimpute_length + grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\nNn' | wc -c | tee assembly_preimpute_length_unambiguous assembly.py impute_from_reference \ - ${sample_name}.intermediate_gapfill.fasta \ - ${sample_name}.scaffolding_chosen_ref.fasta \ - ${sample_name}.scaffolded_imputed.fasta \ - --newName ${sample_name} \ - ${'--replaceLength=' + replace_length} \ - ${'--minLengthFraction=' + min_length_fraction} \ - ${'--minUnambig=' + min_unambig} \ - ${'--aligner=' + aligner} \ + ~{sample_name}.intermediate_gapfill.fasta \ + ~{sample_name}.scaffolding_chosen_ref.fasta \ + ~{sample_name}.scaffolded_imputed.fasta \ + --newName ~{sample_name} \ + ~{'--replaceLength=' + replace_length} \ + ~{'--minLengthFraction=' + min_length_fraction} \ + ~{'--minUnambig=' + min_unambig} \ + ~{'--aligner=' + aligner} \ --loglevel=DEBUG } output { - File scaffold_fasta = "${sample_name}.scaffolded_imputed.fasta" - File intermediate_scaffold_fasta = "${sample_name}.intermediate_scaffold.fasta" - File intermediate_gapfill_fasta = "${sample_name}.intermediate_gapfill.fasta" + File scaffold_fasta = "~{sample_name}.scaffolded_imputed.fasta" + File intermediate_scaffold_fasta = "~{sample_name}.intermediate_scaffold.fasta" + File intermediate_gapfill_fasta = "~{sample_name}.intermediate_gapfill.fasta" Int assembly_preimpute_length = read_int("assembly_preimpute_length") Int assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous") - String scaffolding_chosen_ref_name = read_string("${sample_name}.scaffolding_chosen_ref.txt") - File scaffolding_chosen_ref = "${sample_name}.scaffolding_chosen_ref.fasta" - File scaffolding_stats = "${sample_name}.scaffolding_stats.txt" - File scaffolding_alt_contigs = "${sample_name}.scaffolding_alt_contigs.fasta" + String scaffolding_chosen_ref_name = read_string("~{sample_name}.scaffolding_chosen_ref.txt") + File scaffolding_chosen_ref = "~{sample_name}.scaffolding_chosen_ref.fasta" + File scaffolding_stats = "~{sample_name}.scaffolding_stats.txt" + File scaffolding_alt_contigs = "~{sample_name}.scaffolding_alt_contigs.fasta" String viralngs_version = read_string("VERSION") } @@ -185,18 +185,18 @@ task ivar_trim { command { ivar version | head -1 | tee VERSION - if [ -f "${trim_coords_bed}" ]; then + if [ -f "~{trim_coords_bed}" ]; then ivar trim -e \ - ${'-b ' + trim_coords_bed} \ - ${'-m ' + min_keep_length} \ - ${'-s ' + sliding_window} \ - ${'-q ' + min_quality} \ - ${'-x ' + primer_offset} \ - -i ${aligned_bam} -p trim | tee IVAR_OUT - samtools sort -@ $(nproc) -m 1000M -o ${bam_basename}.trimmed.bam trim.bam + ~{'-b ' + trim_coords_bed} \ + ~{'-m ' + min_keep_length} \ + ~{'-s ' + sliding_window} \ + ~{'-q ' + min_quality} \ + ~{'-x ' + primer_offset} \ + -i ~{aligned_bam} -p trim | tee IVAR_OUT + samtools sort -@ $(nproc) -m 1000M -o ~{bam_basename}.trimmed.bam trim.bam else echo "skipping ivar trim" - cp "${aligned_bam}" "${bam_basename}.trimmed.bam" + cp "~{aligned_bam}" "~{bam_basename}.trimmed.bam" echo "Trimmed primers from 0% (0) of reads." > IVAR_OUT fi PCT=$(grep "Trimmed primers from" IVAR_OUT | perl -lape 's/Trimmed primers from (\S+)%.*/$1/') @@ -205,14 +205,14 @@ task ivar_trim { } output { - File aligned_trimmed_bam = "${bam_basename}.trimmed.bam" + File aligned_trimmed_bam = "~{bam_basename}.trimmed.bam" Float primer_trimmed_read_percent = read_float("IVAR_TRIM_PCT") Int primer_trimmed_read_count = read_int("IVAR_TRIM_COUNT") String ivar_version = read_string("VERSION") } runtime { - docker: "${docker}" + docker: docker memory: select_first([machine_mem_gb, 7]) + " GB" cpu: 4 disks: "local-disk 375 LOCAL" @@ -270,7 +270,7 @@ task ivar_trim_stats { } runtime { - docker: "${docker}" + docker: docker memory: "1 GB" cpu: 1 disks: "local-disk 50 HDD" @@ -304,77 +304,77 @@ task align_reads { aligner: { description: "Short read aligner to use: novoalign, minimap2, or bwa. (Default: novoalign)" } } - command { + command <<< set -ex # do not set pipefail, since grep exits 1 if it can't find the pattern read_utils.py --version | tee VERSION mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90) - cp ${reference_fasta} assembly.fasta + cp "~{reference_fasta}" assembly.fasta grep -v '^>' assembly.fasta | tr -d '\n' | wc -c | tee assembly_length if [ "$(cat assembly_length)" != "0" ]; then # only perform the following if the reference is non-empty - if [ "${aligner}" == "novoalign" ]; then + if [ "~{aligner}" == "novoalign" ]; then read_utils.py novoindex \ assembly.fasta \ - ${"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \ + ~{"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \ --loglevel=DEBUG fi read_utils.py index_fasta_picard assembly.fasta --loglevel=DEBUG read_utils.py index_fasta_samtools assembly.fasta --loglevel=DEBUG read_utils.py align_and_fix \ - ${reads_unmapped_bam} \ + "~{reads_unmapped_bam}" \ assembly.fasta \ - --outBamAll "${sample_name}.all.bam" \ - --outBamFiltered "${sample_name}.mapped.bam" \ - --aligner ${aligner} \ - ${'--aligner_options "' + aligner_options + '"'} \ - ${true='--skipMarkDupes' false="" skip_mark_dupes} \ + --outBamAll "~{sample_name}.all.bam" \ + --outBamFiltered "~{sample_name}.mapped.bam" \ + --aligner ~{aligner} \ + ~{'--aligner_options "' + aligner_options + '"'} \ + ~{true='--skipMarkDupes' false="" skip_mark_dupes} \ --JVMmemory "$mem_in_mb"m \ - ${"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \ + ~{"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \ --loglevel=DEBUG else # handle special case of empty reference fasta -- emit empty bams (with original bam headers) - samtools view -H -b "${reads_unmapped_bam}" > "${sample_name}.all.bam" - samtools view -H -b "${reads_unmapped_bam}" > "${sample_name}.mapped.bam" + samtools view -H -b "~{reads_unmapped_bam}" > "~{sample_name}.all.bam" + samtools view -H -b "~{reads_unmapped_bam}" > "~{sample_name}.mapped.bam" - samtools index "${sample_name}.all.bam" "${sample_name}.all.bai" - samtools index "${sample_name}.mapped.bam" "${sample_name}.mapped.bai" + samtools index "~{sample_name}.all.bam" "~{sample_name}.all.bai" + samtools index "~{sample_name}.mapped.bam" "~{sample_name}.mapped.bai" fi cat /proc/loadavg > CPU_LOAD # collect figures of merit grep -v '^>' assembly.fasta | tr -d '\nNn' | wc -c | tee assembly_length_unambiguous - samtools view -c ${reads_unmapped_bam} | tee reads_provided - samtools view -c ${sample_name}.mapped.bam | tee reads_aligned + samtools view -c "~{reads_unmapped_bam}" | tee reads_provided + samtools view -c "~{sample_name}.mapped.bam" | tee reads_aligned # report only primary alignments 260=exclude unaligned reads and secondary mappings - samtools view -h -F 260 ${sample_name}.all.bam | samtools flagstat - | tee ${sample_name}.all.bam.flagstat.txt - grep properly ${sample_name}.all.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned - samtools view ${sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned + samtools view -h -F 260 "~{sample_name}.all.bam" | samtools flagstat - | tee ~{sample_name}.all.bam.flagstat.txt + grep properly "~{sample_name}.all.bam.flagstat.txt" | cut -f 1 -d ' ' | tee read_pairs_aligned + samtools view "~{sample_name}.mapped.bam" | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned python -c "print (float("$(cat bases_aligned)")/"$(cat assembly_length_unambiguous)") if "$(cat assembly_length_unambiguous)">0 else print(0)" > mean_coverage # fastqc mapped bam - reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html --out_zip ${sample_name}.mapped_fastqc.zip + reports.py fastqc ~{sample_name}.mapped.bam ~{sample_name}.mapped_fastqc.html --out_zip ~{sample_name}.mapped_fastqc.zip cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> output { - File aligned_bam = "${sample_name}.all.bam" - File aligned_bam_idx = "${sample_name}.all.bai" - File aligned_bam_flagstat = "${sample_name}.all.bam.flagstat.txt" - File aligned_only_reads_bam = "${sample_name}.mapped.bam" - File aligned_only_reads_bam_idx = "${sample_name}.mapped.bai" - File aligned_only_reads_fastqc = "${sample_name}.mapped_fastqc.html" - File aligned_only_reads_fastqc_zip = "${sample_name}.mapped_fastqc.zip" + File aligned_bam = "~{sample_name}.all.bam" + File aligned_bam_idx = "~{sample_name}.all.bai" + File aligned_bam_flagstat = "~{sample_name}.all.bam.flagstat.txt" + File aligned_only_reads_bam = "~{sample_name}.mapped.bam" + File aligned_only_reads_bam_idx = "~{sample_name}.mapped.bai" + File aligned_only_reads_fastqc = "~{sample_name}.mapped_fastqc.html" + File aligned_only_reads_fastqc_zip = "~{sample_name}.mapped_fastqc.zip" Int reads_provided = read_int("reads_provided") Int reads_aligned = read_int("reads_aligned") Int read_pairs_aligned = read_int("read_pairs_aligned") @@ -387,7 +387,7 @@ task align_reads { } runtime { - docker: "${docker}" + docker: docker memory: select_first([machine_mem_gb, 15]) + " GB" cpu: 8 disks: "local-disk 375 LOCAL" @@ -432,26 +432,26 @@ task refine_assembly_with_aligned_reads { assembly.py --version | tee VERSION - if [ ${true='true' false='false' mark_duplicates} == "true" ]; then + if [ ~{true='true' false='false' mark_duplicates} == "true" ]; then read_utils.py mkdup_picard \ - ${reads_aligned_bam} \ + ~{reads_aligned_bam} \ temp_markdup.bam \ --JVMmemory "$mem_in_mb"m \ --loglevel=DEBUG else - ln -s ${reads_aligned_bam} temp_markdup.bam + ln -s ~{reads_aligned_bam} temp_markdup.bam fi samtools index -@ $(nproc) temp_markdup.bam temp_markdup.bai - ln -s ${reference_fasta} assembly.fasta + ln -s ~{reference_fasta} assembly.fasta assembly.py refine_assembly \ assembly.fasta \ temp_markdup.bam \ refined.fasta \ --already_realigned_bam=temp_markdup.bam \ - --outVcf ${sample_name}.sites.vcf.gz \ - --min_coverage ${min_coverage} \ - --major_cutoff ${major_cutoff} \ + --outVcf ~{sample_name}.sites.vcf.gz \ + --min_coverage ~{min_coverage} \ + --major_cutoff ~{major_cutoff} \ --JVMmemory "$mem_in_mb"m \ --loglevel=DEBUG @@ -487,7 +487,7 @@ task refine_assembly_with_aligned_reads { } runtime { - docker: "${docker}" + docker: docker memory: select_first([machine_mem_gb, 15]) + " GB" cpu: 8 disks: "local-disk 375 LOCAL" @@ -551,7 +551,7 @@ task refine { } runtime { - docker: "${docker}" + docker: docker memory: select_first([machine_mem_gb, 7]) + " GB" cpu: 8 disks: "local-disk 375 LOCAL" @@ -597,95 +597,95 @@ task refine_2x_and_plot { assembly.py --version | tee VERSION - ln -s ${assembly_fasta} assembly.fasta + ln -s ~{assembly_fasta} assembly.fasta read_utils.py novoindex \ assembly.fasta \ - ${"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \ + ~{"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \ --loglevel=DEBUG # refine 1 assembly.py refine_assembly \ assembly.fasta \ - ${reads_unmapped_bam} \ - ${sample_name}.refine1.fasta \ - --outVcf ${sample_name}.refine1.pre_fasta.vcf.gz \ - --min_coverage ${refine1_min_coverage} \ - --major_cutoff ${refine1_major_cutoff} \ - --novo_params="${refine1_novoalign_options}" \ + ~{reads_unmapped_bam} \ + ~{sample_name}.refine1.fasta \ + --outVcf ~{sample_name}.refine1.pre_fasta.vcf.gz \ + --min_coverage ~{refine1_min_coverage} \ + --major_cutoff ~{refine1_major_cutoff} \ + --novo_params="~{refine1_novoalign_options}" \ --JVMmemory "$mem_in_mb"m \ - ${"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \ + ~{"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \ --loglevel=DEBUG # refine 2 assembly.py refine_assembly \ - ${sample_name}.refine1.fasta \ - ${reads_unmapped_bam} \ - ${sample_name}.fasta \ - --outVcf ${sample_name}.refine2.pre_fasta.vcf.gz \ - --min_coverage ${refine2_min_coverage} \ - --major_cutoff ${refine2_major_cutoff} \ - --novo_params="${refine2_novoalign_options}" \ - ${"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \ + ~{sample_name}.refine1.fasta \ + ~{reads_unmapped_bam} \ + ~{sample_name}.fasta \ + --outVcf ~{sample_name}.refine2.pre_fasta.vcf.gz \ + --min_coverage ~{refine2_min_coverage} \ + --major_cutoff ~{refine2_major_cutoff} \ + --novo_params="~{refine2_novoalign_options}" \ + ~{"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \ --JVMmemory "$mem_in_mb"m \ --loglevel=DEBUG # final alignment read_utils.py align_and_fix \ - ${reads_unmapped_bam} \ - ${sample_name}.fasta \ - --outBamAll ${sample_name}.all.bam \ - --outBamFiltered ${sample_name}.mapped.bam \ - --aligner_options "${plot_coverage_novoalign_options}" \ + ~{reads_unmapped_bam} \ + ~{sample_name}.fasta \ + --outBamAll ~{sample_name}.all.bam \ + --outBamFiltered ~{sample_name}.mapped.bam \ + --aligner_options "~{plot_coverage_novoalign_options}" \ --JVMmemory "$mem_in_mb"m \ - ${"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \ + ~{"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \ --loglevel=DEBUG # collect figures of merit set +o pipefail # grep will exit 1 if it fails to find the pattern - grep -v '^>' ${sample_name}.fasta | tr -d '\n' | wc -c | tee assembly_length - grep -v '^>' ${sample_name}.fasta | tr -d '\nNn' | wc -c | tee assembly_length_unambiguous - samtools view -c ${sample_name}.mapped.bam | tee reads_aligned + grep -v '^>' ~{sample_name}.fasta | tr -d '\n' | wc -c | tee assembly_length + grep -v '^>' ~{sample_name}.fasta | tr -d '\nNn' | wc -c | tee assembly_length_unambiguous + samtools view -c ~{sample_name}.mapped.bam | tee reads_aligned # report only primary alignments 260=exclude unaligned reads and secondary mappings - samtools view -h -F 260 ${sample_name}.all.bam | samtools flagstat - | tee ${sample_name}.all.bam.flagstat.txt - grep properly ${sample_name}.all.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned - samtools view ${sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned + samtools view -h -F 260 ~{sample_name}.all.bam | samtools flagstat - | tee ~{sample_name}.all.bam.flagstat.txt + grep properly ~{sample_name}.all.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned + samtools view ~{sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned #echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage python -c "print (float("$(cat bases_aligned)")/"$(cat assembly_length)") if "$(cat assembly_length)">0 else print(0)" > mean_coverage # fastqc mapped bam - reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html --out_zip ${sample_name}.mapped_fastqc.zip + reports.py fastqc ~{sample_name}.mapped.bam ~{sample_name}.mapped_fastqc.html --out_zip ~{sample_name}.mapped_fastqc.zip # plot coverage if [ $(cat reads_aligned) != 0 ]; then reports.py plot_coverage \ - ${sample_name}.mapped.bam \ - ${sample_name}.coverage_plot.pdf \ - --outSummary "${sample_name}.coverage_plot.txt" \ + ~{sample_name}.mapped.bam \ + ~{sample_name}.coverage_plot.pdf \ + --outSummary "~{sample_name}.coverage_plot.txt" \ --plotFormat pdf \ --plotWidth 1100 \ --plotHeight 850 \ --plotDPI 100 \ - --plotTitle "${sample_name} coverage plot" \ + --plotTitle "~{sample_name} coverage plot" \ --loglevel=DEBUG else - touch ${sample_name}.coverage_plot.pdf ${sample_name}.coverage_plot.txt + touch ~{sample_name}.coverage_plot.pdf ~{sample_name}.coverage_plot.txt fi } output { - File refine1_sites_vcf_gz = "${sample_name}.refine1.pre_fasta.vcf.gz" - File refine1_assembly_fasta = "${sample_name}.refine1.fasta" - File refine2_sites_vcf_gz = "${sample_name}.refine2.pre_fasta.vcf.gz" - File final_assembly_fasta = "${sample_name}.fasta" - File aligned_bam = "${sample_name}.all.bam" - File aligned_bam_idx = "${sample_name}.all.bai" - File aligned_bam_flagstat = "${sample_name}.all.bam.flagstat.txt" - File aligned_only_reads_bam = "${sample_name}.mapped.bam" - File aligned_only_reads_bam_idx = "${sample_name}.mapped.bai" - File aligned_only_reads_fastqc = "${sample_name}.mapped_fastqc.html" - File aligned_only_reads_fastqc_zip = "${sample_name}.mapped_fastqc.zip" - File coverage_plot = "${sample_name}.coverage_plot.pdf" - File coverage_tsv = "${sample_name}.coverage_plot.txt" + File refine1_sites_vcf_gz = "~{sample_name}.refine1.pre_fasta.vcf.gz" + File refine1_assembly_fasta = "~{sample_name}.refine1.fasta" + File refine2_sites_vcf_gz = "~{sample_name}.refine2.pre_fasta.vcf.gz" + File final_assembly_fasta = "~{sample_name}.fasta" + File aligned_bam = "~{sample_name}.all.bam" + File aligned_bam_idx = "~{sample_name}.all.bai" + File aligned_bam_flagstat = "~{sample_name}.all.bam.flagstat.txt" + File aligned_only_reads_bam = "~{sample_name}.mapped.bam" + File aligned_only_reads_bam_idx = "~{sample_name}.mapped.bai" + File aligned_only_reads_fastqc = "~{sample_name}.mapped_fastqc.html" + File aligned_only_reads_fastqc_zip = "~{sample_name}.mapped_fastqc.zip" + File coverage_plot = "~{sample_name}.coverage_plot.pdf" + File coverage_tsv = "~{sample_name}.coverage_plot.txt" Int assembly_length = read_int("assembly_length") Int assembly_length_unambiguous = read_int("assembly_length_unambiguous") Int reads_aligned = read_int("reads_aligned") @@ -696,7 +696,7 @@ task refine_2x_and_plot { } runtime { - docker: "${docker}" + docker: docker memory: select_first([machine_mem_gb, 7]) + " GB" cpu: 8 disks: "local-disk 375 LOCAL" @@ -727,7 +727,7 @@ task run_discordance { # create 2-col table with read group ids in both cols python3 < filtered.vcf - cat filtered.vcf | bcftools filter -i "MAC>0" > "${out_basename}.discordant.vcf" + cat everything.vcf | bcftools filter -e "FMT/DP<~{min_coverage}" -S . > filtered.vcf + cat filtered.vcf | bcftools filter -i "MAC>0" > "~{out_basename}.discordant.vcf" # tally outputs bcftools filter -i 'MAC=0' filtered.vcf | bcftools query -f '%POS\n' | wc -l | tee num_concordant - bcftools filter -i 'TYPE="snp"' "${out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_snps - bcftools filter -i 'TYPE!="snp"' "${out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_indels + bcftools filter -i 'TYPE="snp"' "~{out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_snps + bcftools filter -i 'TYPE!="snp"' "~{out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_indels } output { - File discordant_sites_vcf = "${out_basename}.discordant.vcf" + File discordant_sites_vcf = "~{out_basename}.discordant.vcf" Int concordant_sites = read_int("num_concordant") Int discordant_snps = read_int("num_discordant_snps") Int discordant_indels = read_int("num_discordant_indels") @@ -771,7 +771,7 @@ task run_discordance { } runtime { - docker: "${docker}" + docker: docker memory: "3 GB" cpu: 2 disks: "local-disk 100 HDD" diff --git a/pipes/WDL/tasks/tasks_demux.wdl b/pipes/WDL/tasks/tasks_demux.wdl index e2cbd5cae..ed392824a 100644 --- a/pipes/WDL/tasks/tasks_demux.wdl +++ b/pipes/WDL/tasks/tasks_demux.wdl @@ -361,7 +361,8 @@ task illumina_demux { cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg | cut -f 3 -d ' ' > LOAD_15M - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES + set +o pipefail + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES >>> output { diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl index 315087245..692f78cc4 100644 --- a/pipes/WDL/tasks/tasks_nextstrain.wdl +++ b/pipes/WDL/tasks/tasks_nextstrain.wdl @@ -151,7 +151,7 @@ task nextclade_many_samples { # gather runtime metrics cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES >>> runtime { docker: docker @@ -523,7 +523,7 @@ task nextstrain_build_subsample { cd .. cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES >>> runtime { docker: docker @@ -725,7 +725,7 @@ task filter_subsample_sequences { } } String out_fname = sub(sub(basename(sequences_fasta), ".vcf", ".filtered.vcf"), ".fasta$", ".filtered.fasta") - command { + command <<< set -e augur version > VERSION @@ -764,8 +764,8 @@ task filter_subsample_sequences { grep "strains passed all filters" STDOUT | cut -f 1 -d ' ' > OUT_COUNT cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> runtime { docker: docker memory: "15 GB" @@ -858,7 +858,7 @@ task filter_sequences_to_list { cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES >>> runtime { docker: docker @@ -897,7 +897,7 @@ task mafft_one_chr { Int mem_size = 500 Int cpus = 64 } - command { + command <<< set -e # decompress sequences if necessary @@ -947,8 +947,8 @@ task mafft_one_chr { # profiling and stats cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> runtime { docker: docker memory: mem_size + " GB" @@ -983,7 +983,7 @@ task mafft_one_chr_chunked { Int mem_size = 32 Int cpus = 96 } - command { + command <<< set -e # write out ref @@ -1053,8 +1053,8 @@ task mafft_one_chr_chunked { # profiling and stats cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> runtime { docker: docker memory: mem_size + " GB" @@ -1087,7 +1087,7 @@ task augur_mafft_align { String docker = "nextstrain/base:build-20211012T204409Z" } - command { + command <<< set -e augur version > VERSION augur align --sequences "~{sequences}" \ @@ -1100,8 +1100,8 @@ task augur_mafft_align { --nthreads auto cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> runtime { docker: docker memory: "180 GB" @@ -1163,7 +1163,7 @@ task augur_mask_sites { } } String out_fname = sub(sub(basename(sequences), ".vcf", ".masked.vcf"), ".fasta$", ".masked.fasta") - command { + command <<< set -e augur version > VERSION BEDFILE=~{select_first([mask_bed, "/dev/null"])} @@ -1176,8 +1176,8 @@ task augur_mask_sites { fi cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> runtime { docker: docker memory: "3 GB" @@ -1219,7 +1219,7 @@ task draft_augur_tree { } } String out_basename = basename(basename(basename(msa_or_vcf, '.gz'), '.vcf'), '.fasta') - command { + command <<< set -e augur version > VERSION AUGUR_RECURSION_LIMIT=10000 augur tree --alignment "~{msa_or_vcf}" \ @@ -1232,8 +1232,8 @@ task draft_augur_tree { --nthreads auto cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> runtime { docker: docker memory: "32 GB" @@ -1286,7 +1286,7 @@ task refine_augur_tree { } } String out_basename = basename(basename(basename(msa_or_vcf, '.gz'), '.vcf'), '.fasta') - command { + command <<< set -e augur version > VERSION AUGUR_RECURSION_LIMIT=10000 augur refine \ @@ -1313,8 +1313,8 @@ task refine_augur_tree { ~{"--vcf-reference " + vcf_reference} cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> runtime { docker: docker memory: "50 GB" @@ -1350,7 +1350,7 @@ task ancestral_traits { String docker = "nextstrain/base:build-20211012T204409Z" } String out_basename = basename(tree, '.nwk') - command { + command <<< set -e augur version > VERSION AUGUR_RECURSION_LIMIT=10000 augur traits \ @@ -1363,8 +1363,8 @@ task ancestral_traits { ~{true="--confidence" false="" confidence} cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> runtime { docker: docker memory: "32 GB" @@ -1407,7 +1407,7 @@ task ancestral_tree { } } String out_basename = basename(basename(basename(msa_or_vcf, '.gz'), '.vcf'), '.fasta') - command { + command <<< set -e augur version > VERSION AUGUR_RECURSION_LIMIT=10000 augur ancestral \ @@ -1423,8 +1423,8 @@ task ancestral_tree { ~{true="--infer-ambiguous" false="" infer_ambiguous} cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> runtime { docker: docker memory: "50 GB" @@ -1460,7 +1460,7 @@ task translate_augur_tree { String docker = "nextstrain/base:build-20211012T204409Z" } String out_basename = basename(tree, '.nwk') - command { + command <<< set -e augur version > VERSION AUGUR_RECURSION_LIMIT=10000 augur translate --tree "~{tree}" \ @@ -1470,8 +1470,8 @@ task translate_augur_tree { ~{"--vcf-reference " + vcf_reference} \ ~{"--genes " + genes} \ --output-node-data ~{out_basename}_aa_muts.json - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> runtime { docker: docker memory: "2 GB" @@ -1515,7 +1515,7 @@ task tip_frequencies { String docker = "nextstrain/base:build-20211012T204409Z" String out_basename = basename(tree, '.nwk') } - command { + command <<< set -e augur version > VERSION AUGUR_RECURSION_LIMIT=10000 augur frequencies \ @@ -1539,8 +1539,8 @@ task tip_frequencies { cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> runtime { docker: docker memory: select_first([machine_mem_gb, 30]) + " GB" @@ -1573,7 +1573,7 @@ task assign_clades_to_nodes { String docker = "nextstrain/base:build-20211012T204409Z" } String out_basename = basename(basename(tree_nwk, ".nwk"), "_timetree") - command { + command <<< set -e augur version > VERSION AUGUR_RECURSION_LIMIT=10000 augur clades \ @@ -1582,8 +1582,8 @@ task assign_clades_to_nodes { --reference "~{ref_fasta}" \ --clades "~{clades_tsv}" \ --output-node-data "~{out_basename}_clades.json" - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> runtime { docker: docker memory: "2 GB" @@ -1616,7 +1616,7 @@ task augur_import_beast { String docker = "nextstrain/base:build-20211012T204409Z" } String tree_basename = basename(beast_mcc_tree, ".tree") - command { + command <<< set -e augur version > VERSION AUGUR_RECURSION_LIMIT=10000 augur import beast \ @@ -1629,8 +1629,8 @@ task augur_import_beast { ~{"--tip-date-delimeter " + tip_date_delimiter} cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> runtime { docker: docker memory: select_first([machine_mem_gb, 3]) + " GB" @@ -1674,7 +1674,7 @@ task export_auspice_json { String docker = "nextstrain/base:build-20211012T204409Z" } - command { + command <<< set -e -o pipefail augur version > VERSION touch exportargs @@ -1728,8 +1728,9 @@ task export_auspice_json { touch "~{out_basename}_auspice_root-sequence.json" cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + set +o pipefail + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> runtime { docker: docker memory: "32 GB" diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index 203393ee5..f8871b9aa 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -502,17 +502,18 @@ task compare_two_genomes { String docker = "quay.io/broadinstitute/viral-assemble:2.1.16.1" } - command { + command <<< set -ex -o pipefail assembly.py --version | tee VERSION - assembly.py alignment_summary "${genome_one}" "${genome_two}" --outfileName "${out_basename}.txt" --printCounts --loglevel=DEBUG + assembly.py alignment_summary "~{genome_one}" "~{genome_two}" --outfileName "~{out_basename}.txt" --printCounts --loglevel=DEBUG cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES - } + set +o pipefail + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + >>> output { - File comparison_table = "${out_basename}.txt" + File comparison_table = "~{out_basename}.txt" Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) Int runtime_sec = ceil(read_float("UPTIME_SEC")) String cpu_load = read_string("CPU_LOAD") @@ -522,7 +523,7 @@ task compare_two_genomes { runtime { memory: "3 GB" cpu: 2 - docker: "${docker}" + docker: docker disks: "local-disk 50 HDD" dx_instance_type: "mem1_ssd1_v2_x2" preemptible: 1 diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 89b1a8996..881157855 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -35,6 +35,7 @@ task zcat { Int cpus = 4 } command <<< + set -e python3 < UPTIME_SEC cat /proc/loadavg > CPU_LOAD - cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES >>> runtime { docker: "quay.io/broadinstitute/viral-core:2.1.33"