Merged in memory-adjustments (pull request #205)
v0.7.3 updates
MikeWLloyd committed Sep 26, 2024
2 parents 31cfe7f + fec7d0f commit b08c173
Showing 26 changed files with 274 additions and 68 deletions.
60 changes: 60 additions & 0 deletions ReleaseNotes.md
@@ -1,5 +1,65 @@
# RELEASE NOTES


## Release 0.7.3

In this release we make updates to the ATAC workflow and correct issues related to the PTA workflow.

ATAC:
* Merging of replicate samples is now supported. Use the `--merge_replicates` option along with a CSV input file (see the sketch after this list). See the [wiki page](https://github.com/TheJacksonLaboratory/cs-nf-pipelines/wiki/ATAC-Pipeline-ReadMe#csv-input-sample-sheet) for details on CSV setup.
* GRCm39 pseudo-references generated with G2Gtools are now supported. Previously, GRCm38 was supported via the `--chain` option. For GRCm39, VCI files are the required input, also specified with `--chain`.
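
A minimal sketch of a replicate-aware sample sheet, following the header described in the pipeline help ("sampleID,lane,[replicate],fastq_1,fastq_2"); the sample IDs and paths below are hypothetical:

```csv
sampleID,lane,replicate,fastq_1,fastq_2
sampleA,L001,1,/path/to/sampleA_rep1_R1.fastq.gz,/path/to/sampleA_rep1_R2.fastq.gz
sampleA,L001,2,/path/to/sampleA_rep2_R1.fastq.gz,/path/to/sampleA_rep2_R2.fastq.gz
sampleB,L001,1,/path/to/sampleB_rep1_R1.fastq.gz,/path/to/sampleB_rep1_R2.fastq.gz
```

With `--merge_replicates`, rows that share a `sampleID` are merged across their `replicate` values.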

PTA:
* The mouse PTA workflow would crash when all somatic CNVs were filtered; this has been corrected.
* Numerous adjustments to memory and wall clock limits were made to support high-coverage WGS data.

### Pipelines Added:

None

### Modules Added:

1. modules/g2gtools/g2gtools_vci_convert.nf

### Pipeline Changes:

1. workflows/atac.nf: Replicate merging added. GRCm39 pseudo-reference support added.
1. subworkflows/aria_download_parse.nf: Support for replicate merging added.
1. subworkflows/concatenate_local_files.nf: Support for replicate merging added.

### Module Changes:

1. modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf: wallclock and memory request increases.
1. modules/gridss/gridss_assemble.nf: memory request increase and Java heap adjustment.
1. modules/gridss/gripss_somatic_filter.nf: memory request increase and Java heap adjustment.
1. modules/illumina/manta.nf: memory and wallclock requests were made flat rather than scaled to input file size.
1. modules/picard/picard_mergesamfiles.nf: correct a `file` vs. `path` Nextflow input issue (see the sketch after this list).
1. modules/python/python_somatic_vcf_finalization.nf: wallclock request increase.
1. modules/python/python_somatic_vcf_finalization_mouse.nf: wallclock request increase.
1. modules/r/plot_delly_cnv.nf: add dynamic plot naming based on `sampleID`.
1. modules/samtools/samtools_chain_sort_fixmate_bam.nf: alter module to re-sort final filtered BAM prior to possible replicate merge.
1. modules/samtools/samtools_non_chain_reindex.nf: alter module to re-sort final filtered BAM prior to possible replicate merge.
1. modules/samtools/samtools_stats_insertsize.nf: wallclock request increase.
1. modules/svaba/svaba.nf: memory and wallclock requests increase.
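
The `file` vs. `path` fix in picard_mergesamfiles.nf is worth a note, since this is a common Nextflow pitfall: the `path` qualifier interprets string values as file locations and stages them into the task work directory, while the legacy `file` qualifier can pass strings through unstaged. The actual module diff is not shown on this page, so the following is an illustrative sketch only:

```groovy
// Illustrative sketch only -- not the actual picard_mergesamfiles.nf change.
// `path` stages string-valued inputs as real files in the work directory;
// legacy `file` can silently hand the command an unstaged string instead.
process PICARD_MERGESAMFILES_EXAMPLE {
    input:
    tuple val(sampleID), path(bams)   // was: file(bams)

    output:
    tuple val(sampleID), path("${sampleID}.merged.bam")

    script:
    """
    picard MergeSamFiles \
        ${bams.collect { "INPUT=${it}" }.join(' ')} \
        OUTPUT=${sampleID}.merged.bam
    """
}
```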

### Scripts Added:

None

### Script Changes:

1. bin/gbrs/generate_emission_prob_avecs.py: Modified for use with non-DO strain IDs and a dynamic number of strains.
1. bin/pta/annotate-bedpe-with-cnv.r: Capture the edge case where all somatic CNVs are filtered.
1. bin/pta/annotate-cnv-delly.r: Capture the edge case where all somatic CNVs are filtered.
1. bin/pta/delly_cnv_plot.r: Capture the edge case where all somatic CNVs are filtered.


### NF-Test Modules Added:

None



## Release 0.7.2

In this minor release we correct a bug in `--workflow atac`. In this workflow, the `macs2` module was configured to use a user-defined parameter `tmpdir` for scratch space. However, if the specified `tmpdir` did not exist, `macs2` would fail silently and allow the workflow to continue. This behavior has been fixed.
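
The fix itself predates this commit and is not shown here; conceptually, the guard amounts to creating (or validating) the scratch directory before `macs2` runs. A minimal sketch, in which the process name and the macs2 flags shown are illustrative:

```groovy
// Sketch only: ensure the scratch directory exists so macs2 cannot silently
// skip a missing tmpdir. `params.tmpdir` is the user-defined scratch location
// described above; the process name and macs2 flags here are illustrative.
process MACS2_CALLPEAK_EXAMPLE {
    input:
    tuple val(sampleID), path(bam)

    output:
    tuple val(sampleID), path("${sampleID}_peaks.narrowPeak")

    script:
    """
    mkdir -p ${params.tmpdir}
    macs2 callpeak -t ${bam} -f BAMPE -g ${params.effective_genome_size} \
        --tempdir ${params.tmpdir} -n ${sampleID}
    """
}
```
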
24 changes: 14 additions & 10 deletions bin/gbrs/generate_emission_prob_avecs.py
@@ -20,9 +20,12 @@
parser.add_option("-g", "--gene2transcripts_file", dest="gene2transcripts",
help="the emase gene2transcripts file from the 'prepare_emase' workflow", metavar="emase.gene2transcripts.tsv")

-parser.add_option("-m", "--metadata_file", dest="metadata_file", default="A, B, C, D, E, F, G, H",
+parser.add_option("-m", "--metadata_file", dest="metadata_file",
help="comma delimited metadata file. File must have headers 'do_id' and 'sampleID' where: do_id is ('A', 'B', ..., 'G', 'H'), and sampleID are the IDs produced and used by 'run_emase' (i.e., output directory names) ", metavar="metadata.csv")

parser.add_option("-s", "--strains", dest="strains", default="A,B,C,D,E,F,G,H",
help="comma delimited list of strains", metavar="A,B,C,D,E,F,G,H")

parser.add_option("-e", "--min_expression", dest="min_expression", default="2",
help="minimum expression count to include a gene in Avec computation.", metavar="EXP_MIN")

@@ -64,7 +67,7 @@ def unit_vector(vector):
header = fh.readline().rstrip().split(",")

try:
-    res = [header.index(i) for i in ['do_id', 'sampleID']]
+    res = [header.index(i) for i in ['hap_id', 'sampleID']]
except Exception as e:
print('Error processing metadata file:', e)

@@ -75,7 +78,7 @@ def unit_vector(vector):

sample_id = dict(sample_id)

-print('Found the following keys in "do_id": ' + str(sample_id.keys()))
+print('Found the following keys in "hap_id": ' + str(sample_id.keys()))

print('The total number of samples by strain in the metadata file is: ')

@@ -84,7 +87,8 @@ def unit_vector(vector):

print('Found the following sample counts matched to sample IDs in the input directory...')

-strains = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H') ### This line will require generalization for non DO data.
+strains = [i.strip() for i in options.strains.split(',')]

num_strains = len(strains)

dlist = defaultdict(list)
@@ -112,7 +116,7 @@ def unit_vector(vector):
for curline in fh:
item = curline.rstrip().split("\t")
row = gid[item[0]]
-        dmat_sample[row, :] = list(map(float, item[1:9]))
+        dmat_sample[row, :] = list(map(float, item[1:num_strains+1]))
dmat_strain += dmat_sample
dset[st] = dmat_strain / len(dlist[st])

@@ -133,19 +137,19 @@

for g in gname:
all_genes.add(g)
-        axes[g] = np.zeros((8, 8))
-        ases[g] = np.zeros((1, 8))
-        good = np.zeros(8)
+        axes[g] = np.zeros((num_strains, num_strains))
+        ases[g] = np.zeros((1, num_strains))
+        good = np.zeros(num_strains)
for i, st in enumerate(strains):
v = dset[st][gid[g], :]
axes[g][i, :] = v
ases[g][0, i] = sum(v)
if sum(v) > min_expr:
good[i] = 1.0
if sum(good) > 0: # At least one strain expresses
-        avecs[g] = np.zeros((8, 8))
+        avecs[g] = np.zeros((num_strains, num_strains))
passed_genes.add(g)
-        for i in range(8):
+        for i in range(num_strains):
avecs[g][i, :] = unit_vector(axes[g][i, :])

with open(os.path.join(options.output_directory, options.output_prefix + '.included_genes.txt'), 'w') as f:
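
With the new `--strains` option, the script is no longer hard-coded to the eight DO founders. A hypothetical non-DO invocation (strain names and file names are illustrative; per the parsing above, the metadata file must carry 'hap_id' and 'sampleID' columns, and output-related options defined elsewhere in the script are omitted here):

```bash
python generate_emission_prob_avecs.py \
    -g emase.gene2transcripts.tsv \
    -m metadata.csv \
    -s CAST,PWK,WSB,NOD \
    -e 2
```
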
4 changes: 3 additions & 1 deletion bin/help/atac.nf
@@ -12,12 +12,14 @@ Parameter | Default | Description
--pattern | '*_R{1,2}*' | The expected R1 / R2 matching pattern. The default value will match reads with names like this READ_NAME_R1_MoreText.fastq.gz or READ_NAME_R1.fastq.gz
--read_type | PE | Options: PE and SE. Default: PE. Type of reads: paired end (PE) or single end (SE).
--concat_lanes | false | Options: false and true. Default: false. If this boolean is specified, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes.
---csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified.
+--csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,[replicate],fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Replicate is optional and used only when `--merge_replicates` is used. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified.
--download_data | null | Requires `--csv_input`. When specified, read data in the CSV manifest will be downloaded from provided URLs.
--gen_org | mouse | Options: mouse or human.
--genome_build | 'GRCm38' | Options: GRCm38 or GRCm39
--merge_replicates | false | If specified, replicates will be merged based on the 'replicate' field in the CSV input file.
--effective_genome_size | The length of the “mappable” genome. | See: 'https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html'.
--chain | null | For mouse pseudo-references: the g2gtools chain file (GRCm38) or VCI file (GRCm39) used to adjust coordinates back to the reference.
2 changes: 2 additions & 0 deletions bin/log/atac.nf
@@ -27,6 +27,7 @@ ______________________________________________________
-c ${params.config}
--pubdir ${params.pubdir}
--organize_by ${params.organize_by}
--merge_replicates ${params.merge_replicates}
--bowtie2Index ${params.bowtie2Index}
--bowtieMaxInsert ${params.bowtieMaxInsert}
--bowtieVSensitive ${params.bowtieVSensitive}
@@ -63,6 +64,7 @@ ______________________________________________________
-c ${params.config}
--pubdir ${params.pubdir}
--organize_by ${params.organize_by}
--merge_replicates ${params.merge_replicates}
--effective_genome_size ${params.effective_genome_size}
--chain ${params.chain}
--bowtie2Index ${params.bowtie2Index}
18 changes: 12 additions & 6 deletions bin/pta/annotate-bedpe-with-cnv.r
@@ -33,10 +33,18 @@ readBEDPE = function(f) {
## Read a headered, tab-delimited CNV file into a GRanges object
readCNV = function(f) {

-    x = read.csv(f, h=F, stringsAsFactors=F, sep='\t', comment.char='#')
-    colnames(x)[1:3] = c('chr','start','end')
-    x = makeGRangesFromDataFrame(x)
+    ## if the CNV file is empty (i.e., no somatic CNV), return an empty GRanges object.
+    x <- tryCatch(
+        {
+            x <- read.csv(f, h=F, stringsAsFactors=F, sep='\t', comment.char='#')
+            colnames(x)[1:3] <- c('chr','start','end')
+            makeGRangesFromDataFrame(x)
+        },
+        error = function(e) {
+            GRanges()
+        }
+    )

return(x)

}
@@ -127,8 +135,6 @@ option_list = list(
make_option(c("-o", "--out_file"), type='character', help="Output BEDPE"))
opt = parse_args(OptionParser(option_list=option_list))



## Read bedpe
sv = readBEDPE(opt$bedpe)

16 changes: 14 additions & 2 deletions bin/pta/annotate-cnv-delly.r
@@ -219,7 +219,7 @@ annotateEnsembl = function(x, ens, closest.max.distance=CLOSEST_MAX_DISTANCE) {

## Collect arguments
option_list = list(
-    make_option(c("-c", "--cnv"), type='character', help="Input CNV calles"),
+    make_option(c("-c", "--cnv"), type='character', help="Input CNV calls"),
make_option(c("-a", "--caller"), type='character', help="Name of tool used to call CNVs in --cnv (only delly is currently supported)"),
make_option(c("-t", "--tumor"), type='character', help="Comma-delimited list of database names corresponding to the order in --db_files"),
make_option(c("-n", "--normal"), type='character', help="Comma-delimited list of database files corresponding to the order in --db_names"),
@@ -241,7 +241,19 @@ opt$allowed_chr = unlist(strsplit(opt$allowed_chr, ',', fixed=T))


## Read files
-cnv = readCNV(opt$cnv, chr=opt$allowed_chr)
+cnv <- tryCatch(
+    {
+        readCNV(opt$cnv, chr=opt$allowed_chr)
+    },
+    error = function(e) {
+        empty_df <- setNames(data.frame(matrix(ncol = 10, nrow = 0)), c('#chr', 'start', 'end', 'type', 'log2', 'tool', 'tumor--normal', 'info', 'focal', 'cytoband'))
+        write.table(empty_df, opt$out_file_main, row.names=F, col.names=T, sep='\t', quote=F)
+        write.table(empty_df, opt$out_file_supplemental, row.names=F, col.names=T, sep='\t', quote=F)
+        quit(status=0, save='no')
+    }
+)
cyto = readCytoband(opt$cytoband)
ensembl = readEnsembl(opt$ensembl)

17 changes: 14 additions & 3 deletions bin/pta/delly_cnv_plot.r
@@ -12,7 +12,18 @@ minCN = 0
maxCN = 8
seg = data.frame()
if (length(args)>1) {
-    seg = read.table(args[2], header=F, sep="\t")
+    seg <- tryCatch(
+        {
+            read.table(args[2], header=F, sep="\t")
+        },
+        error = function(e) {
+            p <- ggplot() +
+                annotate("text", x = 0.5, y = 0.5, label = "No somatic CNV segments", size = 10, hjust = 0.5, vjust = 0.5) +
+                theme_void()
+            ggsave(p, file=paste0(args[3], ".plot.wholegenome.png"), width=24, height=6)
+            quit(status=0, save='no')
+        }
+    )
colnames(seg) = c("chr", "start", "end", "id", "cn")
}

@@ -36,7 +47,7 @@ p = p + facet_grid(. ~ chr, scales="free_x", space="free_x")
p = p + ylim(minCN, maxCN)
p = p + theme(axis.text.x = element_text(angle=45, hjust=1))
p = p + ggtitle(args[1])
-ggsave(p, file="plot.wholegenome.png", width=24, height=6)
+ggsave(p, file=paste0(args[3], ".plot.wholegenome.png"), width=24, height=6)
print(warnings())

# By chromosome
@@ -51,7 +62,7 @@ for(chrname in unique(x$chr)) {
if (nrow(sl)) { p = p + geom_segment(data=sl, aes(x=start, y=cn, xend=end, yend=cn), color="#3831a3", size=1.2); }
p = p + ylim(minCN, maxCN)
p = p + theme(axis.text.x = element_text(angle=45, hjust=1))
-    ggsave(p, file=paste0("plot.", chrname, ".png"), width=24, height=6)
+    ggsave(p, file=paste0(args[3], ".plot.", chrname, ".png"), width=24, height=6)
print(warnings())
}

5 changes: 5 additions & 0 deletions bin/shared/extract_csv.nf
@@ -43,6 +43,11 @@ def extract_csv(csv_file) {
if (row.lane) meta.lane = row.lane.toString()
else meta.lane = 'NA'

// If no replicate specified, replicate is not considered
if (row.replicate) meta.replicate = row.replicate.toString()
else meta.replicate = 'NA'


/*
NOTE: Additional metadata parsing could be added here. This function is a minimal implementation of a csv parser.
*/
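
For illustration, a manifest row such as `sampleA,L001,2,...` passed through the logic above yields a meta map of the following shape (values hypothetical; `sampleID` parsing happens earlier in this function):

```groovy
// Sketch of the meta map built for one CSV row.
def meta = [:]
meta.sampleID  = 'sampleA'
meta.lane      = 'L001'
meta.replicate = '2'    // falls back to 'NA' when the optional replicate column is absent
```
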
4 changes: 2 additions & 2 deletions modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf
@@ -2,8 +2,8 @@ process COSMIC_CANCER_RESISTANCE_MUTATION_GERMLINE {
tag "$sampleID"

cpus 1
-    memory 5.GB
-    time 1.hour
+    memory 10.GB
+    time 3.hour
errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}

container 'quay.io/jaxcompsci/py3_perl_pylibs:v2'
File renamed without changes.
26 changes: 26 additions & 0 deletions modules/g2gtools/g2gtools_vci_convert.nf
@@ -0,0 +1,26 @@
process VCI_CONVERT {
tag "$sampleID"

cpus 1
memory 10.GB
time '10:00:00'
errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}

container 'quay.io/jaxcompsci/g2gtools:2.0.1'

publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'g2gtools' }", pattern: "*.log", mode:'copy'

input:
tuple val(sampleID), file(bam_shifted)

output:
tuple val(sampleID), file("*.tmp.mm10.bam"), emit: converted_bam
tuple val(sampleID), file("*g2gconvert.log"), emit: log

when: params.chain != null

script:
"""
g2gtools convert -i ${bam_shifted[0]} -c ${params.chain} --file-format bam -r -o ${sampleID}.tmp.mm10.bam 2> ${sampleID}_g2gconvert.log
"""
}
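
This module converts coordinates in the strain-shifted BAM back to the reference, driven by the file passed via `--chain` (a g2gtools chain file for GRCm38, or a VCI file for GRCm39, per the release notes). A hypothetical GRCm39 invocation of the pipeline (the entry script name and paths are illustrative):

```bash
nextflow run main.nf \
    --workflow atac \
    --gen_org mouse \
    --genome_build GRCm39 \
    --chain /path/to/pseudoreference.vci.gz \
    --csv_input samples.csv
```
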
10 changes: 6 additions & 4 deletions modules/gridss/gridss_assemble.nf
@@ -2,7 +2,7 @@ process GRIDSS_ASSEMBLE {
tag "$sampleID"

cpus = 4
-    memory = 40.GB
+    memory = 100.GB
time = '20:00:00'
errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}

@@ -16,8 +16,9 @@ process GRIDSS_ASSEMBLE {

script:
String my_mem = (task.memory-1.GB).toString()
-    my_mem = my_mem[0..-4]+'g'
+    heap_mem = my_mem[0..-4]+'g'
+    other_mem = my_mem[0..-30]+'g'

output_dir = 'gridss_assemble/'

"""
@@ -31,7 +32,8 @@ process GRIDSS_ASSEMBLE {
fi
gridss \
-        --jvmheap "${my_mem}" \
+        --jvmheap "${heap_mem}" \
+        --otherjvmheap "${other_mem}" \
--steps assemble \
--reference "${params.combined_reference_set}" \
--jar /opt/gridss/gridss-2.13.2-gridss-jar-with-dependencies.jar \
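
The heap strings above are built by slicing the rendered memory string (e.g. '99 GB' becomes '99g'), which is fragile. As a sketch of a sturdier derivation from `task.memory` (the 75/25 split below is illustrative, not the module's actual policy):

```groovy
// Sketch: derive JVM heap strings from task.memory without string slicing.
def usable_gb   = (task.memory - 1.GB).toGiga()   // e.g. 100.GB -> 99
def assembly_gb = (usable_gb * 3).intdiv(4)       // illustrative 75% for the assembly JVM
def heap_mem    = "${assembly_gb}g"               // --jvmheap
def other_mem   = "${usable_gb - assembly_gb}g"   // --otherjvmheap for GRIDSS's other steps
```
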
7 changes: 3 additions & 4 deletions modules/gridss/gripss_somatic_filter.nf
@@ -1,11 +1,10 @@

process GRIPSS_SOMATIC_FILTER {
tag "$sampleID"

cpus = 1
-    memory = 5.GB
+    memory = 30.GB
    time = '01:00:00'
-    errorStrategy 'ignore'
+    errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}

container 'quay.io/biocontainers/hmftools-gripss:2.3.2--hdfd78af_0'

@@ -26,7 +25,7 @@ process GRIPSS_SOMATIC_FILTER {

script:
"""
-    gripss -Xmx5g \
+    gripss -Xmx29g \
-sample ${tumor_name} \
-reference ${normal_name} \
-ref_genome_version 38 \
4 changes: 2 additions & 2 deletions modules/illumina/manta.nf
@@ -2,8 +2,8 @@ process MANTA {
tag "$sampleID"

cpus = 4
-    memory { normal_bam.size() < 60.GB ? 12.GB : 24.GB }
-    time { normal_bam.size() < 60.GB ? '03:00:00' : '12:00:00' }
+    memory 24.GB
+    time '12:00:00'
errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}

container 'quay.io/jaxcompsci/manta:v1.5.0'