From 7d8c7129510cf1271f0a731465be3662e8ee3ac4 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Fri, 6 Sep 2024 09:35:30 -0400 Subject: [PATCH 1/3] mem and time adjust --- modules/gridss/gridss_assemble.nf | 4 ++-- modules/python/python_somatic_vcf_finalization.nf | 2 +- modules/python/python_somatic_vcf_finalization_mouse.nf | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/gridss/gridss_assemble.nf b/modules/gridss/gridss_assemble.nf index 2524c59..d6e04dd 100644 --- a/modules/gridss/gridss_assemble.nf +++ b/modules/gridss/gridss_assemble.nf @@ -2,7 +2,7 @@ process GRIDSS_ASSEMBLE { tag "$sampleID" cpus = 4 - memory = 40.GB + memory = 50.GB time = '20:00:00' errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} @@ -15,7 +15,7 @@ process GRIDSS_ASSEMBLE { tuple val(sampleID), path('gridss_assemble/'), emit: gridss_assembly script: - String my_mem = (task.memory-1.GB).toString() + String my_mem = (task.memory-5.GB).toString() my_mem = my_mem[0..-4]+'g' output_dir = 'gridss_assemble/' diff --git a/modules/python/python_somatic_vcf_finalization.nf b/modules/python/python_somatic_vcf_finalization.nf index 247e37a..379c71b 100644 --- a/modules/python/python_somatic_vcf_finalization.nf +++ b/modules/python/python_somatic_vcf_finalization.nf @@ -3,7 +3,7 @@ process SOMATIC_VCF_FINALIZATION { cpus 1 memory 50.GB - time 1.hour + time '3:00:00' errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' diff --git a/modules/python/python_somatic_vcf_finalization_mouse.nf b/modules/python/python_somatic_vcf_finalization_mouse.nf index 9c73f4b..89a93c8 100644 --- a/modules/python/python_somatic_vcf_finalization_mouse.nf +++ b/modules/python/python_somatic_vcf_finalization_mouse.nf @@ -3,7 +3,7 @@ process SOMATIC_VCF_FINALIZATION { cpus 1 memory 50.GB - time 1.hour + time '3:00:00' errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}

    container 'quay.io/jaxcompsci/py3_perl_pylibs:v2'

From 67b52a0229de6f88dbc0bd44fdfcc4acfb4d8ce5 Mon Sep 17 00:00:00 2001
From: Mike Lloyd
Date: Thu, 26 Sep 2024 09:46:44 -0400
Subject: [PATCH 2/3] v0.7.3 updates

---
 ReleaseNotes.md                               | 60 +++++++++++++++
 bin/gbrs/generate_emission_prob_avecs.py      | 24 +++---
 bin/help/atac.nf                              |  4 +-
 bin/log/atac.nf                               |  2 +
 bin/pta/annotate-bedpe-with-cnv.r             | 18 +++-
 bin/pta/annotate-cnv-delly.r                  | 16 +++-
 bin/pta/delly_cnv_plot.r                      | 17 ++++-
 bin/shared/extract_csv.nf                     |  5 ++
 ...dd_cancer_resistance_mutations_germline.nf |  4 +-
 ...vert_peak.nf => g2gtools_chain_convert.nf} |  0
 modules/g2gtools/g2gtools_vci_convert.nf      | 26 +++++++
 modules/gridss/gridss_assemble.nf             | 12 +--
 modules/gridss/gripss_somatic_filter.nf       |  7 +-
 modules/illumina/manta.nf                     |  4 +-
 modules/picard/picard_mergesamfiles.nf        |  4 +-
 modules/r/plot_delly_cnv.nf                   |  2 +-
 .../samtools_chain_sort_fixmate_bam.nf        |  6 +-
 modules/samtools/samtools_non_chain_reindex.nf |  4 +-
 modules/samtools/samtools_stats_insertsize.nf |  2 +-
 modules/svaba/svaba.nf                        |  4 +-
 nextflow.config                               |  4 +-
 subworkflows/aria_download_parse.nf           | 16 +++-
 subworkflows/concatenate_local_files.nf       | 22 ++++--
 workflows/atac.nf                             | 73 ++++++++++++++++---
 24 files changed, 270 insertions(+), 66 deletions(-)
 rename modules/g2gtools/{g2gtools_chain_convert_peak.nf => g2gtools_chain_convert.nf} (100%)
 create mode 100644 modules/g2gtools/g2gtools_vci_convert.nf

diff --git a/ReleaseNotes.md b/ReleaseNotes.md
index 6c4164b..daad9dc 100644
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -1,5 +1,65 @@
 # RELEASE NOTES
 
+
+## Release 0.7.3
+
+In this release, we make updates to the ATAC workflow and correct issues related to the PTA workflow.
+
+ATAC:
+* Merging of replicate samples is now supported. Use the `--merge_replicates` option, along with a CSV input file. See the [wiki page](https://github.com/TheJacksonLaboratory/cs-nf-pipelines/wiki/ATAC-Pipeline-ReadMe#csv-input-sample-sheet) for details on CSV setup.
+* GRCm39 pseudo-references generated with G2Gtools are now supported. Previously, GRCm38 was supported via the `--chain` option. For GRCm39, VCI files are the required input, and are also specified with `--chain`.
+
+PTA:
+* The mouse PTA workflow would crash when all somatic CNVs were filtered; this has been corrected.
+* Numerous adjustments to memory and wall clock limits were made to support high-coverage WGS data.
+
+### Pipelines Added:
+
+None
+
+### Modules Added:
+
+1. modules/g2gtools/g2gtools_vci_convert.nf
+
+### Pipeline Changes:
+
+1. workflows/atac.nf: Replicate merging added. GRCm39 pseudo-reference support added.
+1. subworkflows/aria_download_parse.nf: Support for replicate merging added.
+1. subworkflows/concatenate_local_files.nf: Support for replicate merging added.
+
+### Module Changes:
+
+1. modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf: wallclock and memory request increase.
+1. modules/gridss/gridss_assemble.nf: memory request increase, and Java heap adjustment.
+1. modules/gridss/gripss_somatic_filter.nf: memory request increase, and Java heap adjustment.
+1. modules/illumina/manta.nf: memory and wallclock requests were made flat rather than scaled to input file size.
+1. modules/picard/picard_mergesamfiles.nf: correct `file` vs. `path` Nextflow issue.
+1. modules/python/python_somatic_vcf_finalization.nf: wallclock request increase.
+1. modules/python/python_somatic_vcf_finalization_mouse.nf: wallclock request increase.
+1. modules/r/plot_delly_cnv.nf: add dynamic plot naming based on `sampleID`.
+1. modules/samtools/samtools_chain_sort_fixmate_bam.nf: alter module to re-sort final filtered BAM prior to possible replicate merge.
+1. modules/samtools/samtools_non_chain_reindex.nf: alter module to re-sort final filtered BAM prior to possible replicate merge.
+1. modules/samtools/samtools_stats_insertsize.nf: wallclock request increase.
+1. modules/svaba/svaba.nf: memory and wallclock requests increase.
+
+### Scripts Added:
+
+None
+
+### Script Changes:
+
+1. bin/gbrs/generate_emission_prob_avecs.py: Modify for use with non-DO strain IDs and dynamic number of strains.
+1. bin/pta/annotate-bedpe-with-cnv.r: Capture edge case where all somatic CNVs are filtered.
+1. bin/pta/annotate-cnv-delly.r: Capture edge case where all somatic CNVs are filtered.
+1. bin/pta/delly_cnv_plot.r: Capture edge case where all somatic CNVs are filtered.
+
+
+### NF-Test Modules Added:
+
+None
+
+
+
 ## Release 0.7.2
 
 In this minor release we correct a bug in `--workflow atac`. In this workflow, the `macs2` module was configured to use a user defined parameter `tmpdir` for scratch space. However, if the specified `tmpdir` did not exist, `macs2` would fail silently, and allow the workflow to continue. This behavior has been fixed.
diff --git a/bin/gbrs/generate_emission_prob_avecs.py b/bin/gbrs/generate_emission_prob_avecs.py
index 0e38983..a54fb5c 100644
--- a/bin/gbrs/generate_emission_prob_avecs.py
+++ b/bin/gbrs/generate_emission_prob_avecs.py
@@ -20,9 +20,12 @@
 parser.add_option("-g", "--gene2transcripts_file", dest="gene2transcripts",
                   help="the emase gene2transcripts file from the 'prepare_emase' workflow", metavar="emase.gene2transcripts.tsv")
 
-parser.add_option("-m", "--metadata_file", dest="metadata_file", default="A, B, C, D, E, F, G, H",
-                  help="comma delimited metadata file. File must have headers 'do_id' and 'sampleID' where: do_id is ('A', 'B', ..., 'G', 'H'), and sampleID are the IDs produced and used by 'run_emase' (i.e., output directory names) ", metavar="metadata.csv")
+parser.add_option("-m", "--metadata_file", dest="metadata_file",
+                  help="comma delimited metadata file. File must have headers 'hap_id' and 'sampleID' where: hap_id is ('A', 'B', ..., 'G', 'H'), and sampleID are the IDs produced and used by 'run_emase' (i.e., output directory names) ", metavar="metadata.csv")
+
+parser.add_option("-s", "--strains", dest="strains", default="A,B,C,D,E,F,G,H",
+                  help="comma delimited list of strains", metavar="A,B,C,D,E,F,G,H")
 
 parser.add_option("-e", "--min_expression", dest="min_expression", default="2",
                   help="minimum expression count to include a gene in Avec computation.", metavar="EXP_MIN")
 
@@ -64,7 +67,7 @@ def unit_vector(vector):
 header = fh.readline().rstrip().split(",")
 
 try:
-    res = [header.index(i) for i in ['do_id', 'sampleID']]
+    res = [header.index(i) for i in ['hap_id', 'sampleID']]
 except Exception as e:
     print('Error processing metadata file:', e)
 
@@ -75,7 +78,7 @@ def unit_vector(vector):
 
 sample_id = dict(sample_id)
 
-print('Found the following keys in "do_id": ' + str(sample_id.keys()))
+print('Found the following keys in "hap_id": ' + str(sample_id.keys()))
 
 print('The total number of samples by strain in the metadata file is: ')
 
@@ -84,7 +87,8 @@ def unit_vector(vector):
 
 print('Found the following sample counts matched to sample IDs in the input directory...')
 
-strains = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H') ### This line will require generalization for non DO data.
+strains = [i.strip() for i in options.strains.split(',')]
+num_strains = len(strains)
 
 dlist = defaultdict(list)
@@ -112,7 +116,7 @@ def unit_vector(vector):
     for curline in fh:
         item = curline.rstrip().split("\t")
         row = gid[item[0]]
-        dmat_sample[row, :] = list(map(float, item[1:9]))
+        dmat_sample[row, :] = list(map(float, item[1:num_strains+1]))
     dmat_strain += dmat_sample
 dset[st] = dmat_strain / len(dlist[st])
 
@@ -133,9 +137,9 @@ def unit_vector(vector):
 
 for g in gname:
     all_genes.add(g)
-    axes[g] = np.zeros((8, 8))
-    ases[g] = np.zeros((1, 8))
-    good = np.zeros(8)
+    axes[g] = np.zeros((num_strains, num_strains))
+    ases[g] = np.zeros((1, num_strains))
+    good = np.zeros(num_strains)
     for i, st in enumerate(strains):
         v = dset[st][gid[g], :]
         axes[g][i, :] = v
@@ -143,9 +147,9 @@ def unit_vector(vector):
         if sum(v) > min_expr:
             good[i] = 1.0
     if sum(good) > 0: # At least one strain expresses
-        avecs[g] = np.zeros((8, 8))
+        avecs[g] = np.zeros((num_strains, num_strains))
         passed_genes.add(g)
-        for i in range(8):
+        for i in range(num_strains):
             avecs[g][i, :] = unit_vector(axes[g][i, :])
 
 with open(os.path.join(options.output_directory, options.output_prefix + '.included_genes.txt'), 'w') as f:
diff --git a/bin/help/atac.nf b/bin/help/atac.nf
index 51717a4..9f47314 100644
--- a/bin/help/atac.nf
+++ b/bin/help/atac.nf
@@ -12,12 +12,14 @@
 Parameter | Default | Description
 --pattern | '*_R{1,2}*' | The expected R1 / R2 matching pattern. The default value will match reads with names like this READ_NAME_R1_MoreText.fastq.gz or READ_NAME_R1.fastq.gz
 --read_type | PE | Options: PE and SE. Default: PE. Type of reads: paired end (PE) or single end (SE).
 --concat_lanes | false | Options: false and true. Default: false. If this boolean is specified, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes.
---csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified.
+--csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,[replicate],fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Replicate is optional and used only with `--merge_replicates`. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified.
 --download_data | null | Requires `--csv_input`. When specified, read data in the CSV manifest will be downloaded from provided URLs.
 --gen_org | mouse | Options: mouse or human.
 --genome_build | 'GRCm38' | Options: GRCm38 or GRCm39
 
+--merge_replicates | false | If specified, replicates will be merged based on the 'replicate' field in the CSV input file.
+
 --effective_genome_size | The length of the “mappable” genome. | See : 'https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html'.
---chain | null | The default value for Mouse Reference Strain - g2gtools chain file to adjust coordinates to reference
+--chain | null | For mouse pseudo-references: the g2gtools chain file (GRCm38) or VCI file (GRCm39) used to adjust coordinates back to the reference
 
diff --git a/bin/log/atac.nf b/bin/log/atac.nf
index c724e0c..9ec4640 100644
--- a/bin/log/atac.nf
+++ b/bin/log/atac.nf
@@ -27,6 +27,7 @@ ______________________________________________________
 -c ${params.config}
 --pubdir ${params.pubdir}
 --organize_by ${params.organize_by}
+--merge_replicates ${params.merge_replicates}
 --bowtie2Index ${params.bowtie2Index}
 --bowtieMaxInsert ${params.bowtieMaxInsert}
 --bowtieVSensitive ${params.bowtieVSensitive}
@@ -63,6 +64,7 @@ ______________________________________________________
 -c ${params.config}
 --pubdir ${params.pubdir}
 --organize_by ${params.organize_by}
+--merge_replicates ${params.merge_replicates}
 --effective_genome_size ${params.effective_genome_size}
 --chain ${params.chain}
 --bowtie2Index ${params.bowtie2Index}
diff --git a/bin/pta/annotate-bedpe-with-cnv.r b/bin/pta/annotate-bedpe-with-cnv.r
index 982b9b9..7555696 100644
--- a/bin/pta/annotate-bedpe-with-cnv.r
+++ b/bin/pta/annotate-bedpe-with-cnv.r
@@ -33,10 +33,18 @@ readBEDPE = function(f) {
 
 ## Read a headered, tab-delimited CNV file into a GRanges object
 readCNV = function(f) {
-  x = read.csv(f, h=F, stringsAsFactors=F, sep='\t', comment.char='#')
-  colnames(x)[1:3] = c('chr','start','end')
-  x = makeGRangesFromDataFrame(x)
-  
+  x <- tryCatch(
+    {
+      x = read.csv(f, h=F, stringsAsFactors=F, sep='\t', comment.char='#')
+      colnames(x)[1:3] = c('chr','start','end')
+      x = makeGRangesFromDataFrame(x)
+    },
+    error = function(e) {
+      GRanges()
+    }
+  )
+  # if CNV CSV is empty (i.e., no somatic CNV), return empty GRanges object.
   return(x)
 }
 
@@ -127,8 +135,6 @@ option_list = list(
   make_option(c("-o", "--out_file"), type='character', help="Output BEDPE"))
 opt = parse_args(OptionParser(option_list=option_list))
 
-
-
 ## Read bedpe
 sv = readBEDPE(opt$bedpe)
 
diff --git a/bin/pta/annotate-cnv-delly.r b/bin/pta/annotate-cnv-delly.r
index 828500c..5aad94b 100644
--- a/bin/pta/annotate-cnv-delly.r
+++ b/bin/pta/annotate-cnv-delly.r
@@ -219,7 +219,7 @@ annotateEnsembl = function(x, ens, closest.max.distance=CLOSEST_MAX_DISTANCE) {
 
 ## Collect arguments
 option_list = list(
-  make_option(c("-c", "--cnv"), type='character', help="Input CNV calles"),
+  make_option(c("-c", "--cnv"), type='character', help="Input CNV calls"),
   make_option(c("-a", "--caller"), type='character', help="Name of tool used to call CNVs in --cnv (only delly is currently supported)"),
   make_option(c("-t", "--tumor"), type='character', help="Comma-delimited list of database names corresponding to the order in --db_files"),
   make_option(c("-n", "--normal"), type='character', help="Comma-delimited list of database files corresponding to the order in --db_names"),
@@ -241,7 +241,19 @@ opt$allowed_chr = unlist(strsplit(opt$allowed_chr, ',', fixed=T))
 
 
 ## Read files
-cnv = readCNV(opt$cnv, chr=opt$allowed_chr)
+cnv <- tryCatch(
+    {
+      readCNV(opt$cnv, chr=opt$allowed_chr)
+    },
+    error = function(e) {
+      empty_df <- setNames(data.frame(matrix(ncol = 10, nrow = 0)), c('#chr', 'start', 'end', 'type', 'log2', 'tool', 'tumor--normal', 'info', 'focal', 'cytoband'))
+
+      write.table(empty_df, opt$out_file_main, row.names=F, col.names=T, sep='\t', quote=F)
+      write.table(empty_df, opt$out_file_supplemental, row.names=F, col.names=T, sep='\t', quote=F)
+
+      quit(status=0, save='no')
+    }
+)
 cyto = readCytoband(opt$cytoband)
 ensembl = readEnsembl(opt$ensembl)
 
diff --git a/bin/pta/delly_cnv_plot.r b/bin/pta/delly_cnv_plot.r
index cfaa6b0..da6bb75 100644
--- a/bin/pta/delly_cnv_plot.r
+++ b/bin/pta/delly_cnv_plot.r
@@ -12,7 +12,18 @@
 minCN = 0
 maxCN = 8
 seg = data.frame()
 if (length(args)>1) {
-  seg = read.table(args[2], header=F, sep="\t")
+  seg <- tryCatch(
+    {
+      read.table(args[2], header=F, sep="\t")
+    },
+    error = function(e) {
+      p <- ggplot() +
+        annotate("text", x = 0.5, y = 0.5, label = "No somatic CNV segments", size = 10, hjust = 0.5, vjust = 0.5) +
+        theme_void()
+      ggsave(p, file=paste0(args[3], ".plot.wholegenome.png"), width=24, height=6)
+      quit(status=0, save='no')
+    }
+  )
   colnames(seg) = c("chr", "start", "end", "id", "cn")
 }
 
@@ -36,7 +47,7 @@
 p = p + facet_grid(. ~ chr, scales="free_x", space="free_x")
 p = p + ylim(minCN, maxCN)
 p = p + theme(axis.text.x = element_text(angle=45, hjust=1))
 p = p + ggtitle(args[1])
-ggsave(p, file="plot.wholegenome.png", width=24, height=6)
+ggsave(p, file=paste0(args[3], ".plot.wholegenome.png"), width=24, height=6)
 print(warnings())
 
 # By chromosome
@@ -51,7 +62,7 @@ for(chrname in unique(x$chr)) {
     if (nrow(sl)) { p = p + geom_segment(data=sl, aes(x=start, y=cn, xend=end, yend=cn), color="#3831a3", size=1.2); }
     p = p + ylim(minCN, maxCN)
     p = p + theme(axis.text.x = element_text(angle=45, hjust=1))
-    ggsave(p, file=paste0("plot.", chrname, ".png"), width=24, height=6)
+    ggsave(p, file=paste0(args[3], ".plot.", chrname, ".png"), width=24, height=6)
     print(warnings())
 }
 
diff --git a/bin/shared/extract_csv.nf b/bin/shared/extract_csv.nf
index 46b7045..44403db 100644
--- a/bin/shared/extract_csv.nf
+++ b/bin/shared/extract_csv.nf
@@ -43,6 +43,11 @@ def extract_csv(csv_file) {
         if (row.lane) meta.lane = row.lane.toString()
         else meta.lane = 'NA'
 
+        // If no replicate is specified, replicate is not considered
+        if (row.replicate) meta.replicate = row.replicate.toString()
+        else meta.replicate = 'NA'
+
+
-        /* NOTE: Additional metadata parsing could be added here. This function is a minimal implimentation of a csv parser. */
+        /* NOTE: Additional metadata parsing could be added here. This function is a minimal implementation of a csv parser. */
diff --git a/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf b/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf
index 3b4063f..d5e6275 100644
--- a/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf
+++ b/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf
@@ -2,8 +2,8 @@ process COSMIC_CANCER_RESISTANCE_MUTATION_GERMLINE {
     tag "$sampleID"
 
     cpus 1
-    memory 5.GB
-    time 1.hour
+    memory 10.GB
+    time 3.hour
     errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}
 
     container 'quay.io/jaxcompsci/py3_perl_pylibs:v2'
diff --git a/modules/g2gtools/g2gtools_chain_convert_peak.nf b/modules/g2gtools/g2gtools_chain_convert.nf
similarity index 100%
rename from modules/g2gtools/g2gtools_chain_convert_peak.nf
rename to modules/g2gtools/g2gtools_chain_convert.nf
diff --git a/modules/g2gtools/g2gtools_vci_convert.nf b/modules/g2gtools/g2gtools_vci_convert.nf
new file mode 100644
index 0000000..8d5b828
--- /dev/null
+++ b/modules/g2gtools/g2gtools_vci_convert.nf
@@ -0,0 +1,26 @@
+process VCI_CONVERT {
+    tag "$sampleID"
+
+    cpus 1
+    memory 10.GB
+    time '10:00:00'
+    errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}
+
+    container 'quay.io/jaxcompsci/g2gtools:2.0.1'
+
+    publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'g2gtools' }", pattern: "*.log", mode:'copy'
+
+    input:
+    tuple val(sampleID), file(bam_shifted)
+
+    output:
+    tuple val(sampleID), file("*.tmp.mm10.bam"), emit: converted_bam
+    tuple val(sampleID), file("*g2gconvert.log"), emit: log
+
+    when: params.chain != null
+
+    script:
+    """
+    g2gtools convert -i ${bam_shifted[0]} -c ${params.chain} --file-format bam -r -o ${sampleID}.tmp.mm10.bam 2> ${sampleID}_g2gconvert.log
+    """
+}
diff --git a/modules/gridss/gridss_assemble.nf b/modules/gridss/gridss_assemble.nf
index d6e04dd..465579d 100644
--- a/modules/gridss/gridss_assemble.nf
+++ b/modules/gridss/gridss_assemble.nf
@@ -2,7 +2,7 @@ process GRIDSS_ASSEMBLE {
     tag "$sampleID"
 
     cpus = 4
-    memory = 50.GB
+    memory = 100.GB
     time = '20:00:00'
     errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}
@@ -15,9 +15,10 @@
     tuple val(sampleID), path('gridss_assemble/'), emit: gridss_assembly
 
     script:
-    String my_mem = (task.memory-5.GB).toString()
-    my_mem = my_mem[0..-4]+'g'
-    
+    String my_mem = (task.memory-1.GB).toString()
+    heap_mem = my_mem[0..-4]+'g'
+    other_mem = ((task.memory-30.GB).toString())[0..-4]+'g' // assumption: a reduced heap for GRIDSS's non-assembly JVM steps
+    
    output_dir = 'gridss_assemble/'

    """
    if [ ! -d "gridss_assemble" ]; then
        mkdir "gridss_assemble"
    fi

    gridss \
-        --jvmheap "${my_mem}" \
+        --jvmheap "${heap_mem}" \
+        --otherjvmheap "${other_mem}" \
         --steps assemble \
         --reference "${params.combined_reference_set}" \
         --jar /opt/gridss/gridss-2.13.2-gridss-jar-with-dependencies.jar \
diff --git a/modules/gridss/gripss_somatic_filter.nf b/modules/gridss/gripss_somatic_filter.nf
index bfb9d5a..dc8453a 100644
--- a/modules/gridss/gripss_somatic_filter.nf
+++ b/modules/gridss/gripss_somatic_filter.nf
@@ -1,11 +1,10 @@
-
 process GRIPSS_SOMATIC_FILTER {
     tag "$sampleID"
 
     cpus = 1
-    memory = 5.GB
+    memory = 30.GB
     time = '01:00:00'
-    errorStrategy 'ignore'
+    errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}
 
     container 'quay.io/biocontainers/hmftools-gripss:2.3.2--hdfd78af_0'
@@ -26,7 +25,7 @@
 
     script:
     """
-    gripss -Xmx5g \
+    gripss -Xmx29g \
         -sample ${tumor_name} \
         -reference ${normal_name} \
         -ref_genome_version 38 \
diff --git a/modules/illumina/manta.nf b/modules/illumina/manta.nf
index a960123..f1be49f 100644
--- a/modules/illumina/manta.nf
+++ b/modules/illumina/manta.nf
@@ -2,8 +2,8 @@ process MANTA {
     tag "$sampleID"
 
     cpus = 4
-    memory { normal_bam.size() < 60.GB ? 12.GB : 24.GB }
-    time { normal_bam.size() < 60.GB ? '03:00:00' : '12:00:00' }
+    memory 24.GB
+    time '12:00:00'
    errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/manta:v1.5.0' diff --git a/modules/picard/picard_mergesamfiles.nf b/modules/picard/picard_mergesamfiles.nf index ac52496..26f7472 100644 --- a/modules/picard/picard_mergesamfiles.nf +++ b/modules/picard/picard_mergesamfiles.nf @@ -14,10 +14,10 @@ process PICARD_MERGESAMFILES { input: - tuple val(sampleID), file(bam) + tuple val(sampleID), path(bam) output: - tuple val(sampleID), file("*.bam"), emit: bam + tuple val(sampleID), path("*.bam"), emit: bam script: String my_mem = (task.memory-1.GB).toString() diff --git a/modules/r/plot_delly_cnv.nf b/modules/r/plot_delly_cnv.nf index e7bd18c..5027202 100644 --- a/modules/r/plot_delly_cnv.nf +++ b/modules/r/plot_delly_cnv.nf @@ -18,6 +18,6 @@ process PLOT_DELLY_CNV { script: """ - Rscript ${projectDir}/bin/pta/delly_cnv_plot.r ${cov} ${seg_bed} + Rscript ${projectDir}/bin/pta/delly_cnv_plot.r ${cov} ${seg_bed} ${sampleID} """ } diff --git a/modules/samtools/samtools_chain_sort_fixmate_bam.nf b/modules/samtools/samtools_chain_sort_fixmate_bam.nf index 40aa920..2c71b9d 100644 --- a/modules/samtools/samtools_chain_sort_fixmate_bam.nf +++ b/modules/samtools/samtools_chain_sort_fixmate_bam.nf @@ -32,7 +32,7 @@ process CHAIN_SORT_FIXMATE_BAM { samtools fixmate \ -O bam ${sampleID}.tmp3.mm10.bam ${sampleID}.tmp4.mm10.bam - # re-sort bam by coordinates + # re-sort bam by coordinates. This step is required for the MT filter to work properly samtools sort \ -@ $task.cpus -O bam \ -o ${sampleID}.tmp5.mm10.bam ${sampleID}.tmp4.mm10.bam @@ -44,9 +44,11 @@ process CHAIN_SORT_FIXMATE_BAM { -h 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y \ | grep -ve 'SN:MT*' > tmp.sam - samtools view -b tmp.sam \ + # re-sort following MT removal. This is required for PICARD MERGE + samtools sort -@ $task.cpus tmp.sam \ > ${sampleID}.sorted.rmDup.rmChrM.rmMulti.filtered.shifted.mm10.bam + # Index BAM samtools index \ ${sampleID}.sorted.rmDup.rmChrM.rmMulti.filtered.shifted.mm10.bam """ diff --git a/modules/samtools/samtools_non_chain_reindex.nf b/modules/samtools/samtools_non_chain_reindex.nf index 2e2cf3f..f024be5 100644 --- a/modules/samtools/samtools_non_chain_reindex.nf +++ b/modules/samtools/samtools_non_chain_reindex.nf @@ -29,9 +29,11 @@ process NON_CHAIN_REINDEX { -h 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y \ | grep -ve 'SN:MT*\\|SN:GL*\\|SN:JH:*' > tmp.sam - samtools view -b tmp.sam \ + # Re-sort BAM following MT removal. This is required for PICARD MERGE + samtools sort -@ $task.cpus tmp.sam \ > ${sampleID}.sorted.rmDup.rmChrM.rmMulti.filtered.shifted.mm10.bam + # Index BAM samtools index \ ${sampleID}.sorted.rmDup.rmChrM.rmMulti.filtered.shifted.mm10.bam """ diff --git a/modules/samtools/samtools_stats_insertsize.nf b/modules/samtools/samtools_stats_insertsize.nf index 4d8b5e5..0a7e2f8 100644 --- a/modules/samtools/samtools_stats_insertsize.nf +++ b/modules/samtools/samtools_stats_insertsize.nf @@ -3,7 +3,7 @@ process SAMTOOLS_STATS_INSERTSIZE { cpus 8 memory 1.GB - time '01:00:00' + time '02:00:00' errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/samtools:1.14--hb421002_0' diff --git a/modules/svaba/svaba.nf b/modules/svaba/svaba.nf index 109d984..c1d198f 100644 --- a/modules/svaba/svaba.nf +++ b/modules/svaba/svaba.nf @@ -2,8 +2,8 @@ process SVABA { tag "$sampleID" cpus = 8 - memory { normal_bam.size() < 60.GB ? 15.GB : 48.GB } - time { normal_bam.size() < 60.GB ? '10:00:00' : '24:00:00' } + memory { normal_bam.size() < 60.GB ? 30.GB : 48.GB } + time { normal_bam.size() < 60.GB ? '18:00:00' : '24:00:00' } errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/svaba:v0.2.1' diff --git a/nextflow.config b/nextflow.config index 181497d..89ac3b2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,7 +21,7 @@ params { keep_intermediate = false // true fastq2 = true // default is PE for workflows tmpdir = "/flashscratch/${USER}" // generic param - + merge_replicates = false // get help help = null @@ -45,7 +45,7 @@ manifest { homePage = "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" mainScript = "main.nf" nextflowVersion = "!>=22.04.3" - version = "0.7.2" + version = "0.7.3" author = 'Michael Lloyd, Brian Sanderson, Barry Guglielmo, Sai Lek, Peter Fields, Harshpreet Chandok, Carolyn Paisie, Gabriel Rech, Ardian Ferraj, Tejas Temker, Anuj Srivastava. Copyright Jackson Laboratory 2024' } diff --git a/subworkflows/aria_download_parse.nf b/subworkflows/aria_download_parse.nf index d756c6a..b2e06d1 100644 --- a/subworkflows/aria_download_parse.nf +++ b/subworkflows/aria_download_parse.nf @@ -28,15 +28,25 @@ workflow FILE_DOWNLOAD { if (params.read_type == 'PE') { aria_download_input = ch_input_sample .multiMap { it -> - R1: tuple(it[0], it[1], 'R1', it[2]) - R2: tuple(it[0], it[1], 'R2', it[3]) + if (params.merge_replicates) { + sampleID = it[1].sampleID+'_'+it[1].replicate + } else { + sampleID = it[1].sampleID + } + R1: tuple(sampleID, it[1], 'R1', it[2]) + R2: tuple(sampleID, it[1], 'R2', it[3]) } .mix() group_size = 2 } else { aria_download_input = ch_input_sample .multiMap { it -> - R1: tuple(it[0], it[1], 'R1', it[2]) + if (params.merge_replicates) { + sampleID = it[1].sampleID+'_'+it[1].replicate + } else { + sampleID = it[1].sampleID + } + R1: tuple(sampleID, it[1], 'R1', it[2]) } .mix() group_size = 1 diff --git a/subworkflows/concatenate_local_files.nf b/subworkflows/concatenate_local_files.nf index 36fb8a3..9f9045a 100644 --- a/subworkflows/concatenate_local_files.nf +++ b/subworkflows/concatenate_local_files.nf @@ -15,9 +15,15 @@ workflow CONCATENATE_LOCAL_FILES { temp_map = ch_input_sample .multiMap { it -> def meta = [:] - meta.sampleID = it[1].sampleID - R1: tuple(it[0], it[1].lane, meta, 'R1', it[2]) - R2: tuple(it[0], it[1].lane, meta, 'R2', it[3]) + if (params.merge_replicates) { + meta.sampleID = it[1].replicate != 'NA' ? 
it[1].sampleID+'_'+it[1].replicate : it[1].sampleID
+                    meta.replicate = it[1].replicate
+                    meta.baseSampleID = it[1].sampleID
+                } else {
+                    meta.sampleID = it[1].sampleID
+                }
+                R1: tuple(meta.sampleID, it[1].lane, meta, 'R1', it[2])
+                R2: tuple(meta.sampleID, it[1].lane, meta, 'R2', it[3])
             }
             .mix()
             .groupTuple(by: [0,2,3])
@@ -34,8 +40,14 @@ workflow CONCATENATE_LOCAL_FILES {
         temp_map = ch_input_sample
             .multiMap { it ->
                 def meta = [:]
-                meta.sampleID = it[1].sampleID
-                R1: tuple(it[0], it[1].lane, meta, 'R1', it[2])
+                if (params.merge_replicates) {
+                    meta.sampleID = it[1].replicate != 'NA' ? it[1].sampleID+'_'+it[1].replicate : it[1].sampleID
+                    meta.replicate = it[1].replicate
+                    meta.baseSampleID = it[1].sampleID
+                } else {
+                    meta.sampleID = it[1].sampleID
+                }
+                R1: tuple(meta.sampleID, it[1].lane, meta, 'R1', it[2])
             }
             .groupTuple(by: [0,2,3])
             .map{ it -> tuple(it[0], it[1].size(), it[2], it[3], it[4]) } // sampleID, num_lanes, meta, read_ID:[R1], file
diff --git a/workflows/atac.nf b/workflows/atac.nf
index 5f6e56a..1d2c21e 100755
--- a/workflows/atac.nf
+++ b/workflows/atac.nf
@@ -22,12 +22,16 @@ include {REMOVE_DUPLICATE_READS} from "${projectDir}/modules/samtools/samtools_r
 include {CALC_MTDNA_FILTER_CHRM} from "${projectDir}/modules/samtools/samtools_calc_mtdna_filter_chrm"
 include {FILTER_REMOVE_MULTI_SHIFT} from "${projectDir}/modules/samtools/samtools_filter_remove_multi_shift"
 include {FILTER_REMOVE_MULTI_SIEVE} from "${projectDir}/modules/deeptools/deeptools_filter_remove_multi_sieve"
-include {CHAIN_CONVERT} from "${projectDir}/modules/g2gtools/g2gtools_chain_convert_peak"
+include {CHAIN_CONVERT} from "${projectDir}/modules/g2gtools/g2gtools_chain_convert"
+include {VCI_CONVERT} from "${projectDir}/modules/g2gtools/g2gtools_vci_convert"
 include {CHAIN_EXTRACT_BADREADS} from "${projectDir}/modules/gatk/gatk_chain_extract_badreads"
 include {CHAIN_BAD2UNIQ_READS} from "${projectDir}/modules/samtools/samtools_chain_bad2uniq_reads"
 include {CHAIN_FILTER_READS} from "${projectDir}/modules/gatk/gatk_chain_filter_reads"
 include {CHAIN_SORT_FIXMATE_BAM} from "${projectDir}/modules/samtools/samtools_chain_sort_fixmate_bam"
 include {NON_CHAIN_REINDEX} from "${projectDir}/modules/samtools/samtools_non_chain_reindex"
+include {SAMTOOLS_INDEX;
+         SAMTOOLS_INDEX as SAMTOOLS_INDEX_MERGED} from "${projectDir}/modules/samtools/samtools_index"
+include {PICARD_MERGESAMFILES} from "${projectDir}/modules/picard/picard_mergesamfiles"
 include {PEAK_CALLING} from "${projectDir}/modules/macs2/macs2_peak_calling"
 include {BAM_COVERAGE_BIGWIG} from "${projectDir}/modules/deeptools/deeptools_bam_coverage_bigwig"
 include {FRIP_READS_IN_PEAKS} from "${projectDir}/modules/bedtools/bedtools_frip_reads_in_peaks"
@@ -54,6 +58,18 @@
 if (params.download_data && !params.csv_input) {
     exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information."
 }
 
+if (params.merge_replicates && !params.csv_input) {
+    exit 1, "Replicate merging was requested with `--merge_replicates`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--merge_replicates` requires a CSV manifest. See `--help` for information."
+}
+
+if (!(params.genome_build in ['GRCm38', 'GRCm39', 'GRCh38'])) {
+    exit 1, "Invalid genome build specified. Please use one of the following: GRCm38, GRCm39, GRCh38."
+}
+
+if (params.gen_org == 'human' && params.genome_build != 'GRCh38') {
+    exit 1, "Invalid genome build specified for human. Please use GRCh38."
+}
+
 // prepare reads channel
 if (params.csv_input) {
 
@@ -164,11 +180,16 @@ workflow ATAC {
     // If Mouse
     if (params.gen_org=='mouse'){
         // Step 11: Convert peak coordinates
-        // Step occurs when chain != null || chain != false
-        CHAIN_CONVERT(SORT_SHIFTED_BAM.out.sorted_file)
-
+        // Chain and VCI steps occur when chain != null || chain != false
+        if (params.genome_build == 'GRCm38') {
+            CHAIN_CONVERT(SORT_SHIFTED_BAM.out.sorted_file)
+            converted_bam = CHAIN_CONVERT.out.converted_bam
+        } else if (params.genome_build == 'GRCm39') {
+            VCI_CONVERT(SORT_SHIFTED_BAM.out.sorted_file)
+            converted_bam = VCI_CONVERT.out.converted_bam
+        }
         // Step 12: Sort bam by coordinates
-        SORT_LIFTOVER_BAM(CHAIN_CONVERT.out.converted_bam, '-O bam', 'bam')
+        SORT_LIFTOVER_BAM(converted_bam, '-O bam', 'bam')
 
         // Step 13: Extract a list of 'bad reads'
         CHAIN_EXTRACT_BADREADS(SORT_LIFTOVER_BAM.out.sorted_file)
@@ -199,31 +220,59 @@ workflow ATAC {
 
     } else if (params.gen_org=='human'){
-        data_ch = SORT_SHIFTED_BAM.out.sorted_file
-    }
+        SAMTOOLS_INDEX(SORT_SHIFTED_BAM.out.sorted_file)
+        data_ch = SORT_SHIFTED_BAM.out.sorted_file.join(SAMTOOLS_INDEX.out.bai)
+                  .map{it -> [it[0], [it[1], it[2]]]}
+    }
+
+    if (params.merge_replicates) {
+
+        data_ch.join(meta_ch)
+        .map{it -> [it[2].baseSampleID, it[1][0], it[1][1]]}
+        .groupTuple(by: [0])
+        .map { it -> [ it[0], [it[1], it[2]].flatten() ] }
+        .branch {
+            merge: it[1].size() > 2
+            pass: it[1].size() == 2
+        }
+        .set{merge_ch}
+        // samples with more than one replicate will be merged (i.e., [BAM, BAI, BAM, BAI, ...])
+        // samples with a single replicate (i.e., [BAM, BAI]) are passed through, then added back to the channel with `mix`
+
+        PICARD_MERGESAMFILES(merge_ch.merge)
+        SAMTOOLS_INDEX_MERGED(PICARD_MERGESAMFILES.out.bam)
+
+        peak_ch = PICARD_MERGESAMFILES.out.bam.join(SAMTOOLS_INDEX_MERGED.out.bai)
+                  .map{it -> [it[0], [it[1], it[2]]]}
+                  .mix(merge_ch.pass)
+        // samples where merge was not done are added back to the channel with `mix`
+
+    } else {
+        peak_ch = data_ch
+    }
 
     // Step 19: Peak calling
-    PEAK_CALLING(data_ch)
+    PEAK_CALLING(peak_ch)
 
     if (params.gen_org=='mouse'){
         // Step 20: deeptools bamCoverage bigwig
-        BAM_COVERAGE_BIGWIG(data_ch)
+        BAM_COVERAGE_BIGWIG(peak_ch)
     }
 
     // Step 21: Fraction of reads in peaks (FRiP)
-    frip_start = data_ch.join(PEAK_CALLING.out.np)
+    frip_start = peak_ch.join(PEAK_CALLING.out.np)
     FRIP_READS_IN_PEAKS(frip_start)
 
     // Step 22: Final calculate (FRiP)
-    frip_calc = data_ch.join(FRIP_READS_IN_PEAKS.out[0])
+    frip_calc = peak_ch.join(FRIP_READS_IN_PEAKS.out[0])
     FINAL_CALC_FRIP(frip_calc)
 
     // Step 23: Get coverage in each peak
     PEAK_COVERAGE(PEAK_CALLING.out.np)
 
     // Step 24: Feature counts
-    feature_counts = data_ch.join(PEAK_COVERAGE.out)
+    feature_counts = peak_ch.join(PEAK_COVERAGE.out)
     FEATURE_COUNTS(feature_counts)
 
     if (params.gen_org=='mouse'){

From fec7d0f3fbe9e46a923eb87c43f10259933b7d82 Mon Sep 17 00:00:00 2001
From: Mike Lloyd
Date: Thu, 26 Sep 2024 09:49:02 -0400
Subject: [PATCH 3/3] add max rss and vmem to trace report

---
 nextflow.config | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index 89ac3b2..1a6716d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -87,11 +87,13 @@ trace {
         + "%cpu," \
         + "%mem," \
         + "rss," \
+        + "peak_rss," \
         + "vmem," \
+        + "peak_vmem," \
         + "rchar," \
         + "wchar"
 }
-
+
 dag {
     enabled = true
    file = 
"${params.pubdir}/trace/${params.workflow}_dagPlot_${trace_timestamp}.html"