From 7d8c7129510cf1271f0a731465be3662e8ee3ac4 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Fri, 6 Sep 2024 09:35:30 -0400 Subject: [PATCH 1/3] mem and time adjust --- modules/gridss/gridss_assemble.nf | 4 ++-- modules/python/python_somatic_vcf_finalization.nf | 2 +- modules/python/python_somatic_vcf_finalization_mouse.nf | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/gridss/gridss_assemble.nf b/modules/gridss/gridss_assemble.nf index 2524c59..d6e04dd 100644 --- a/modules/gridss/gridss_assemble.nf +++ b/modules/gridss/gridss_assemble.nf @@ -2,7 +2,7 @@ process GRIDSS_ASSEMBLE { tag "$sampleID" cpus = 4 - memory = 40.GB + memory = 50.GB time = '20:00:00' errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} @@ -15,7 +15,7 @@ process GRIDSS_ASSEMBLE { tuple val(sampleID), path('gridss_assemble/'), emit: gridss_assembly script: - String my_mem = (task.memory-1.GB).toString() + String my_mem = (task.memory-5.GB).toString() my_mem = my_mem[0..-4]+'g' output_dir = 'gridss_assemble/' diff --git a/modules/python/python_somatic_vcf_finalization.nf b/modules/python/python_somatic_vcf_finalization.nf index 247e37a..379c71b 100644 --- a/modules/python/python_somatic_vcf_finalization.nf +++ b/modules/python/python_somatic_vcf_finalization.nf @@ -3,7 +3,7 @@ process SOMATIC_VCF_FINALIZATION { cpus 1 memory 50.GB - time 1.hour + time '3:00:00' errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' diff --git a/modules/python/python_somatic_vcf_finalization_mouse.nf b/modules/python/python_somatic_vcf_finalization_mouse.nf index 9c73f4b..89a93c8 100644 --- a/modules/python/python_somatic_vcf_finalization_mouse.nf +++ b/modules/python/python_somatic_vcf_finalization_mouse.nf @@ -3,7 +3,7 @@ process SOMATIC_VCF_FINALIZATION { cpus 1 memory 50.GB - time 1.hour + time '3:00:00' errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}

    container 'quay.io/jaxcompsci/py3_perl_pylibs:v2'

From 67b52a0229de6f88dbc0bd44fdfcc4acfb4d8ce5 Mon Sep 17 00:00:00 2001
From: Mike Lloyd
Date: Thu, 26 Sep 2024 09:46:44 -0400
Subject: [PATCH 2/3] v0.7.3 updates

---
 ReleaseNotes.md                               | 60 +++++++++++++++
 bin/gbrs/generate_emission_prob_avecs.py      | 24 +++---
 bin/help/atac.nf                              |  4 +-
 bin/log/atac.nf                               |  2 +
 bin/pta/annotate-bedpe-with-cnv.r             | 18 +++-
 bin/pta/annotate-cnv-delly.r                  | 16 +++-
 bin/pta/delly_cnv_plot.r                      | 17 ++++-
 bin/shared/extract_csv.nf                     |  5 ++
 ...dd_cancer_resistance_mutations_germline.nf |  4 +-
 ...vert_peak.nf => g2gtools_chain_convert.nf} |  0
 modules/g2gtools/g2gtools_vci_convert.nf      | 26 +++++++
 modules/gridss/gridss_assemble.nf             | 12 +--
 modules/gridss/gripss_somatic_filter.nf       |  7 +-
 modules/illumina/manta.nf                     |  4 +-
 modules/picard/picard_mergesamfiles.nf        |  4 +-
 modules/r/plot_delly_cnv.nf                   |  2 +-
 .../samtools_chain_sort_fixmate_bam.nf        |  6 +-
 modules/samtools/samtools_non_chain_reindex.nf |  4 +-
 modules/samtools/samtools_stats_insertsize.nf |  2 +-
 modules/svaba/svaba.nf                        |  4 +-
 nextflow.config                               |  4 +-
 subworkflows/aria_download_parse.nf           | 16 +++-
 subworkflows/concatenate_local_files.nf       | 22 ++++--
 workflows/atac.nf                             | 73 ++++++++++++++++---
 24 files changed, 270 insertions(+), 66 deletions(-)
 rename modules/g2gtools/{g2gtools_chain_convert_peak.nf => g2gtools_chain_convert.nf} (100%)
 create mode 100644 modules/g2gtools/g2gtools_vci_convert.nf

diff --git a/ReleaseNotes.md b/ReleaseNotes.md
index 6c4164b..daad9dc 100644
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -1,5 +1,65 @@
 # RELEASE NOTES
 
+
+## Release 0.7.3
+
+In this release, we make updates to the ATAC workflow and correct issues related to the PTA workflow.
+
+ATAC:
+* Merging of replicate samples is now supported. Use the `--merge_replicates` option, along with a CSV input file. See the [wiki page](https://github.com/TheJacksonLaboratory/cs-nf-pipelines/wiki/ATAC-Pipeline-ReadMe#csv-input-sample-sheet) for details on CSV setup.
+* GRCm39 pseudo-references generated with G2Gtools are now supported. Previously, GRCm38 was supported via the `--chain` option. For GRCm39, VCI files are the required input, and are also specified with `--chain`.
+
+PTA:
+* The mouse PTA workflow would crash when all somatic CNVs were filtered; this has been corrected.
+* Numerous adjustments to memory and wall clock limits were made to support high-coverage WGS data.
+
+### Pipelines Added:
+
+None
+
+### Modules Added:
+
+1. modules/g2gtools/g2gtools_vci_convert.nf
+
+### Pipeline Changes:
+
+1. workflows/atac.nf: Replicate merging added. GRCm39 pseudo-reference support added.
+1. subworkflows/aria_download_parse.nf: Support for replicate merging added.
+1. subworkflows/concatenate_local_files.nf: Support for replicate merging added.
+
+### Module Changes:
+
+1. modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf: wallclock and memory request increase.
+1. modules/gridss/gridss_assemble.nf: memory request increase, and Java heap adjustment.
+1. modules/gridss/gripss_somatic_filter.nf: memory request increase, and Java heap adjustment.
+1. modules/illumina/manta.nf: memory and wallclock requests were made flat rather than scaled to input file size.
+1. modules/picard/picard_mergesamfiles.nf: correct `file` vs. `path` Nextflow issue.
+1. modules/python/python_somatic_vcf_finalization.nf: wallclock request increase.
+1. modules/python/python_somatic_vcf_finalization_mouse.nf: wallclock request increase.
+1. modules/r/plot_delly_cnv.nf: add dynamic plot naming based on `sampleID`.
+1. modules/samtools/samtools_chain_sort_fixmate_bam.nf: alter module to re-sort final filtered BAM prior to possible replicate merge.
+1. modules/samtools/samtools_non_chain_reindex.nf: alter module to re-sort final filtered BAM prior to possible replicate merge.
+1. modules/samtools/samtools_stats_insertsize.nf: wallclock request increase.
+1. modules/svaba/svaba.nf: memory and wallclock requests increase.
+
+### Scripts Added:
+
+None
+
+### Script Changes:
+
+1. bin/gbrs/generate_emission_prob_avecs.py: Modify for use with non-DO strain IDs and dynamic number of strains.
+1. bin/pta/annotate-bedpe-with-cnv.r: Capture edge case where all somatic CNVs are filtered.
+1. bin/pta/annotate-cnv-delly.r: Capture edge case where all somatic CNVs are filtered.
+1. bin/pta/delly_cnv_plot.r: Capture edge case where all somatic CNVs are filtered.
+
+
+### NF-Test Modules Added:
+
+None
+
+
+
 ## Release 0.7.2
 
 In this minor release we correct a bug in `--workflow atac`. In this workflow, the `macs2` module was configured to use a user defined parameter `tmpdir` for scratch space. However, if the specified `tmpdir` did not exist, `macs2` would fail silently, and allow the workflow to continue. This behavior has been fixed.
diff --git a/bin/gbrs/generate_emission_prob_avecs.py b/bin/gbrs/generate_emission_prob_avecs.py
index 0e38983..a54fb5c 100644
--- a/bin/gbrs/generate_emission_prob_avecs.py
+++ b/bin/gbrs/generate_emission_prob_avecs.py
@@ -20,9 +20,12 @@
 parser.add_option("-g", "--gene2transcripts_file", dest="gene2transcripts",
                   help="the emase gene2transcripts file from the 'prepare_emase' workflow", metavar="emase.gene2transcripts.tsv")
 
-parser.add_option("-m", "--metadata_file", dest="metadata_file", default="A, B, C, D, E, F, G, H",
-                  help="comma delimited metadata file. File must have headers 'do_id' and 'sampleID' where: do_id is ('A', 'B', ..., 'G', 'H'), and sampleID are the IDs produced and used by 'run_emase' (i.e., output directory names) ", metavar="metadata.csv")
+parser.add_option("-m", "--metadata_file", dest="metadata_file",
+                  help="comma delimited metadata file. File must have headers 'hap_id' and 'sampleID' where: hap_id is ('A', 'B', ..., 'G', 'H'), and sampleID are the IDs produced and used by 'run_emase' (i.e., output directory names) ", metavar="metadata.csv")
+
+parser.add_option("-s", "--strains", dest="strains", default="A,B,C,D,E,F,G,H",
+                  help="comma delimited list of strains", metavar="A,B,C,D,E,F,G,H")
 
 parser.add_option("-e", "--min_expression", dest="min_expression", default="2",
                   help="minimum expression count to include a gene in Avec computation.", metavar="EXP_MIN")
 
@@ -64,7 +67,7 @@ def unit_vector(vector):
 header = fh.readline().rstrip().split(",")
 
 try:
-    res = [header.index(i) for i in ['do_id', 'sampleID']]
+    res = [header.index(i) for i in ['hap_id', 'sampleID']]
 except Exception as e:
     print('Error processing metadata file:', e)
 
@@ -75,7 +78,7 @@ def unit_vector(vector):
 
 sample_id = dict(sample_id)
 
-print('Found the following keys in "do_id": ' + str(sample_id.keys()))
+print('Found the following keys in "hap_id": ' + str(sample_id.keys()))
 
 print('The total number of samples by strain in the metadata file is: ')
 
@@ -84,7 +87,8 @@ def unit_vector(vector):
 
 print('Found the following sample counts matched to sample IDs in the input directory...')
 
-strains = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H') ### This line will require generalization for non DO data.
+strains = [i.strip() for i in options.strains.split(',')]
+num_strains = len(strains)
 
 dlist = defaultdict(list)
@@ -112,7 +116,7 @@ def unit_vector(vector):
     for curline in fh:
         item = curline.rstrip().split("\t")
         row = gid[item[0]]
-        dmat_sample[row, :] = list(map(float, item[1:9]))
+        dmat_sample[row, :] = list(map(float, item[1:num_strains+1]))
     dmat_strain += dmat_sample
 dset[st] = dmat_strain / len(dlist[st])
 
@@ -133,9 +137,9 @@ def unit_vector(vector):
 
 for g in gname:
     all_genes.add(g)
-    axes[g] = np.zeros((8, 8))
-    ases[g] = np.zeros((1, 8))
-    good = np.zeros(8)
+    axes[g] = np.zeros((num_strains, num_strains))
+    ases[g] = np.zeros((1, num_strains))
+    good = np.zeros(num_strains)
     for i, st in enumerate(strains):
         v = dset[st][gid[g], :]
         axes[g][i, :] = v
@@ -143,9 +147,9 @@ def unit_vector(vector):
         if sum(v) > min_expr:
             good[i] = 1.0
     if sum(good) > 0: # At least one strain expresses
-        avecs[g] = np.zeros((8, 8))
+        avecs[g] = np.zeros((num_strains, num_strains))
         passed_genes.add(g)
-        for i in range(8):
+        for i in range(num_strains):
             avecs[g][i, :] = unit_vector(axes[g][i, :])
 
 with open(os.path.join(options.output_directory, options.output_prefix + '.included_genes.txt'), 'w') as f:
diff --git a/bin/help/atac.nf b/bin/help/atac.nf
index 51717a4..9f47314 100644
--- a/bin/help/atac.nf
+++ b/bin/help/atac.nf
@@ -12,12 +12,14 @@
 Parameter | Default | Description
 --pattern | '*_R{1,2}*' | The expected R1 / R2 matching pattern. The default value will match reads with names like this READ_NAME_R1_MoreText.fastq.gz or READ_NAME_R1.fastq.gz
 --read_type | PE | Options: PE and SE. Default: PE. Type of reads: paired end (PE) or single end (SE).
 --concat_lanes | false | Options: false and true. Default: false. If this boolean is specified, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes.
---csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified.
+--csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,[replicate],fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Replicate is optional and used only with `--merge_replicates`. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified.
 --download_data | null | Requires `--csv_input`. When specified, read data in the CSV manifest will be downloaded from provided URLs.
 --gen_org | mouse | Options: mouse or human.
 --genome_build | 'GRCm38' | Options: GRCm38 or GRCm39
 
+--merge_replicates | false | If specified, replicates will be merged based on the 'replicate' field in the CSV input file.
+
 --effective_genome_size | The length of the “mappable” genome. | See : 'https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html'.
---chain | null | The default value for Mouse Reference Strain - g2gtools chain file to adjust coordinates to reference
+--chain | null | For mouse pseudo-references: the g2gtools chain file (GRCm38) or VCI file (GRCm39) used to adjust coordinates back to the reference
 
diff --git a/bin/log/atac.nf b/bin/log/atac.nf
index c724e0c..9ec4640 100644
--- a/bin/log/atac.nf
+++ b/bin/log/atac.nf
@@ -27,6 +27,7 @@ ______________________________________________________
 -c ${params.config}
 --pubdir ${params.pubdir}
 --organize_by ${params.organize_by}
+--merge_replicates ${params.merge_replicates}
 --bowtie2Index ${params.bowtie2Index}
 --bowtieMaxInsert ${params.bowtieMaxInsert}
 --bowtieVSensitive ${params.bowtieVSensitive}
@@ -63,6 +64,7 @@ ______________________________________________________
 -c ${params.config}
 --pubdir ${params.pubdir}
 --organize_by ${params.organize_by}
+--merge_replicates ${params.merge_replicates}
 --effective_genome_size ${params.effective_genome_size}
 --chain ${params.chain}
 --bowtie2Index ${params.bowtie2Index}
diff --git a/bin/pta/annotate-bedpe-with-cnv.r b/bin/pta/annotate-bedpe-with-cnv.r
index 982b9b9..7555696 100644
--- a/bin/pta/annotate-bedpe-with-cnv.r
+++ b/bin/pta/annotate-bedpe-with-cnv.r
@@ -33,10 +33,18 @@ readBEDPE = function(f) {
 
 ## Read a headered, tab-delimited CNV file into a GRanges object
 readCNV = function(f) {
-  x = read.csv(f, h=F, stringsAsFactors=F, sep='\t', comment.char='#')
-  colnames(x)[1:3] = c('chr','start','end')
-  x = makeGRangesFromDataFrame(x)
-  
+  x <- tryCatch(
+    {
+      x = read.csv(f, h=F, stringsAsFactors=F, sep='\t', comment.char='#')
+      colnames(x)[1:3] = c('chr','start','end')
+      x = makeGRangesFromDataFrame(x)
+    },
+    error = function(e) {
+      GRanges()
+    }
+  )
+  # if CNV CSV is empty (i.e., no somatic CNV), return empty GRanges object.
   return(x)
 }
 
@@ -127,8 +135,6 @@ option_list = list(
   make_option(c("-o", "--out_file"), type='character', help="Output BEDPE"))
 opt = parse_args(OptionParser(option_list=option_list))
 
-
-
 ## Read bedpe
 sv = readBEDPE(opt$bedpe)
 
diff --git a/bin/pta/annotate-cnv-delly.r b/bin/pta/annotate-cnv-delly.r
index 828500c..5aad94b 100644
--- a/bin/pta/annotate-cnv-delly.r
+++ b/bin/pta/annotate-cnv-delly.r
@@ -219,7 +219,7 @@ annotateEnsembl = function(x, ens, closest.max.distance=CLOSEST_MAX_DISTANCE) {
 
 ## Collect arguments
 option_list = list(
-  make_option(c("-c", "--cnv"), type='character', help="Input CNV calles"),
+  make_option(c("-c", "--cnv"), type='character', help="Input CNV calls"),
   make_option(c("-a", "--caller"), type='character', help="Name of tool used to call CNVs in --cnv (only delly is currently supported)"),
   make_option(c("-t", "--tumor"), type='character', help="Comma-delimited list of database names corresponding to the order in --db_files"),
   make_option(c("-n", "--normal"), type='character', help="Comma-delimited list of database files corresponding to the order in --db_names"),
@@ -241,7 +241,19 @@ opt$allowed_chr = unlist(strsplit(opt$allowed_chr, ',', fixed=T))
 
 
 ## Read files
-cnv = readCNV(opt$cnv, chr=opt$allowed_chr)
+cnv <- tryCatch(
+    {
+      readCNV(opt$cnv, chr=opt$allowed_chr)
+    },
+    error = function(e) {
+      empty_df <- setNames(data.frame(matrix(ncol = 10, nrow = 0)), c('#chr', 'start', 'end', 'type', 'log2', 'tool', 'tumor--normal', 'info', 'focal', 'cytoband'))
+
+      write.table(empty_df, opt$out_file_main, row.names=F, col.names=T, sep='\t', quote=F)
+      write.table(empty_df, opt$out_file_supplemental, row.names=F, col.names=T, sep='\t', quote=F)
+
+      quit(status=0, save='no')
+    }
+)
 cyto = readCytoband(opt$cytoband)
 ensembl = readEnsembl(opt$ensembl)
 
diff --git a/bin/pta/delly_cnv_plot.r b/bin/pta/delly_cnv_plot.r
index cfaa6b0..da6bb75 100644
--- a/bin/pta/delly_cnv_plot.r
+++ b/bin/pta/delly_cnv_plot.r
@@ -12,7 +12,18 @@
 minCN = 0
 maxCN = 8
 seg = data.frame()
 if (length(args)>1) {
-  seg = read.table(args[2], header=F, sep="\t")
+  seg <- tryCatch(
+    {
+      read.table(args[2], header=F, sep="\t")
+    },
+    error = function(e) {
+      p <- ggplot() +
+        annotate("text", x = 0.5, y = 0.5, label = "No somatic CNV segments", size = 10, hjust = 0.5, vjust = 0.5) +
+        theme_void()
+      ggsave(p, file=paste0(args[3], ".plot.wholegenome.png"), width=24, height=6)
+      quit(status=0, save='no')
+    }
+  )
   colnames(seg) = c("chr", "start", "end", "id", "cn")
 }
 
@@ -36,7 +47,7 @@
 p = p + facet_grid(. ~ chr, scales="free_x", space="free_x")
 p = p + ylim(minCN, maxCN)
 p = p + theme(axis.text.x = element_text(angle=45, hjust=1))
 p = p + ggtitle(args[1])
-ggsave(p, file="plot.wholegenome.png", width=24, height=6)
+ggsave(p, file=paste0(args[3], ".plot.wholegenome.png"), width=24, height=6)
 print(warnings())
 
 # By chromosome
@@ -51,7 +62,7 @@ for(chrname in unique(x$chr)) {
     if (nrow(sl)) { p = p + geom_segment(data=sl, aes(x=start, y=cn, xend=end, yend=cn), color="#3831a3", size=1.2); }
     p = p + ylim(minCN, maxCN)
     p = p + theme(axis.text.x = element_text(angle=45, hjust=1))
-    ggsave(p, file=paste0("plot.", chrname, ".png"), width=24, height=6)
+    ggsave(p, file=paste0(args[3], ".plot.", chrname, ".png"), width=24, height=6)
     print(warnings())
 }
 
diff --git a/bin/shared/extract_csv.nf b/bin/shared/extract_csv.nf
index 46b7045..44403db 100644
--- a/bin/shared/extract_csv.nf
+++ b/bin/shared/extract_csv.nf
@@ -43,6 +43,11 @@ def extract_csv(csv_file) {
         if (row.lane) meta.lane = row.lane.toString()
         else meta.lane = 'NA'
 
+        // If no replicate is specified, replicate is not considered
+        if (row.replicate) meta.replicate = row.replicate.toString()
+        else meta.replicate = 'NA'
+
+
-        /* NOTE: Additional metadata parsing could be added here. This function is a minimal implimentation of a csv parser. */
+        /* NOTE: Additional metadata parsing could be added here. This function is a minimal implementation of a csv parser. */
diff --git a/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf b/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf
index 3b4063f..d5e6275 100644
--- a/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf
+++ b/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf
@@ -2,8 +2,8 @@ process COSMIC_CANCER_RESISTANCE_MUTATION_GERMLINE {
     tag "$sampleID"
 
     cpus 1
-    memory 5.GB
-    time 1.hour
+    memory 10.GB
+    time 3.hour
     errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}
 
     container 'quay.io/jaxcompsci/py3_perl_pylibs:v2'
diff --git a/modules/g2gtools/g2gtools_chain_convert_peak.nf b/modules/g2gtools/g2gtools_chain_convert.nf
similarity index 100%
rename from modules/g2gtools/g2gtools_chain_convert_peak.nf
rename to modules/g2gtools/g2gtools_chain_convert.nf
diff --git a/modules/g2gtools/g2gtools_vci_convert.nf b/modules/g2gtools/g2gtools_vci_convert.nf
new file mode 100644
index 0000000..8d5b828
--- /dev/null
+++ b/modules/g2gtools/g2gtools_vci_convert.nf
@@ -0,0 +1,26 @@
+process VCI_CONVERT {
+    tag "$sampleID"
+
+    cpus 1
+    memory 10.GB
+    time '10:00:00'
+    errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}
+
+    container 'quay.io/jaxcompsci/g2gtools:2.0.1'
+
+    publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'g2gtools' }", pattern: "*.log", mode:'copy'
+
+    input:
+    tuple val(sampleID), file(bam_shifted)
+
+    output:
+    tuple val(sampleID), file("*.tmp.mm10.bam"), emit: converted_bam
+    tuple val(sampleID), file("*g2gconvert.log"), emit: log
+
+    when: params.chain != null
+
+    script:
+    """
+    g2gtools convert -i ${bam_shifted[0]} -c ${params.chain} --file-format bam -r -o ${sampleID}.tmp.mm10.bam 2> ${sampleID}_g2gconvert.log
+    """
+}
diff --git a/modules/gridss/gridss_assemble.nf b/modules/gridss/gridss_assemble.nf
index d6e04dd..465579d 100644
--- a/modules/gridss/gridss_assemble.nf
+++ b/modules/gridss/gridss_assemble.nf
@@ -2,7 +2,7 @@ process GRIDSS_ASSEMBLE {
     tag "$sampleID"
 
     cpus = 4
-    memory = 50.GB
+    memory = 100.GB
     time = '20:00:00'
     errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}
@@ -15,9 +15,10 @@
     tuple val(sampleID), path('gridss_assemble/'), emit: gridss_assembly
 
     script:
-    String my_mem = (task.memory-5.GB).toString()
-    my_mem = my_mem[0..-4]+'g'
-    
+    String my_mem = (task.memory-1.GB).toString()
+    heap_mem = my_mem[0..-4]+'g'
+    other_mem = ((task.memory-30.GB).toString())[0..-4]+'g' // assumption: a reduced heap for GRIDSS's non-assembly JVM steps
+    
    output_dir = 'gridss_assemble/'

    """
    if [ ! -d "gridss_assemble" ]; then
        mkdir "gridss_assemble"
    fi

    gridss \
-        --jvmheap "${my_mem}" \
+        --jvmheap "${heap_mem}" \
+        --otherjvmheap "${other_mem}" \
         --steps assemble \
         --reference "${params.combined_reference_set}" \
         --jar /opt/gridss/gridss-2.13.2-gridss-jar-with-dependencies.jar \
diff --git a/modules/gridss/gripss_somatic_filter.nf b/modules/gridss/gripss_somatic_filter.nf
index bfb9d5a..dc8453a 100644
--- a/modules/gridss/gripss_somatic_filter.nf
+++ b/modules/gridss/gripss_somatic_filter.nf
@@ -1,11 +1,10 @@
-
 process GRIPSS_SOMATIC_FILTER {
     tag "$sampleID"
 
     cpus = 1
-    memory = 5.GB
+    memory = 30.GB
     time = '01:00:00'
-    errorStrategy 'ignore'
+    errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}
 
     container 'quay.io/biocontainers/hmftools-gripss:2.3.2--hdfd78af_0'
@@ -26,7 +25,7 @@
 
     script:
     """
-    gripss -Xmx5g \
+    gripss -Xmx29g \
         -sample ${tumor_name} \
         -reference ${normal_name} \
         -ref_genome_version 38 \
diff --git a/modules/illumina/manta.nf b/modules/illumina/manta.nf
index a960123..f1be49f 100644
--- a/modules/illumina/manta.nf
+++ b/modules/illumina/manta.nf
@@ -2,8 +2,8 @@ process MANTA {
     tag "$sampleID"
 
     cpus = 4
-    memory { normal_bam.size() < 60.GB ? 12.GB : 24.GB }
-    time { normal_bam.size() < 60.GB ? '03:00:00' : '12:00:00' }
+    memory 24.GB
+    time '12:00:00'
    errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/manta:v1.5.0' diff --git a/modules/picard/picard_mergesamfiles.nf b/modules/picard/picard_mergesamfiles.nf index ac52496..26f7472 100644 --- a/modules/picard/picard_mergesamfiles.nf +++ b/modules/picard/picard_mergesamfiles.nf @@ -14,10 +14,10 @@ process PICARD_MERGESAMFILES { input: - tuple val(sampleID), file(bam) + tuple val(sampleID), path(bam) output: - tuple val(sampleID), file("*.bam"), emit: bam + tuple val(sampleID), path("*.bam"), emit: bam script: String my_mem = (task.memory-1.GB).toString() diff --git a/modules/r/plot_delly_cnv.nf b/modules/r/plot_delly_cnv.nf index e7bd18c..5027202 100644 --- a/modules/r/plot_delly_cnv.nf +++ b/modules/r/plot_delly_cnv.nf @@ -18,6 +18,6 @@ process PLOT_DELLY_CNV { script: """ - Rscript ${projectDir}/bin/pta/delly_cnv_plot.r ${cov} ${seg_bed} + Rscript ${projectDir}/bin/pta/delly_cnv_plot.r ${cov} ${seg_bed} ${sampleID} """ } diff --git a/modules/samtools/samtools_chain_sort_fixmate_bam.nf b/modules/samtools/samtools_chain_sort_fixmate_bam.nf index 40aa920..2c71b9d 100644 --- a/modules/samtools/samtools_chain_sort_fixmate_bam.nf +++ b/modules/samtools/samtools_chain_sort_fixmate_bam.nf @@ -32,7 +32,7 @@ process CHAIN_SORT_FIXMATE_BAM { samtools fixmate \ -O bam ${sampleID}.tmp3.mm10.bam ${sampleID}.tmp4.mm10.bam - # re-sort bam by coordinates + # re-sort bam by coordinates. This step is required for the MT filter to work properly samtools sort \ -@ $task.cpus -O bam \ -o ${sampleID}.tmp5.mm10.bam ${sampleID}.tmp4.mm10.bam @@ -44,9 +44,11 @@ process CHAIN_SORT_FIXMATE_BAM { -h 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y \ | grep -ve 'SN:MT*' > tmp.sam - samtools view -b tmp.sam \ + # re-sort following MT removal. This is required for PICARD MERGE + samtools sort -@ $task.cpus tmp.sam \ > ${sampleID}.sorted.rmDup.rmChrM.rmMulti.filtered.shifted.mm10.bam + # Index BAM samtools index \ ${sampleID}.sorted.rmDup.rmChrM.rmMulti.filtered.shifted.mm10.bam """ diff --git a/modules/samtools/samtools_non_chain_reindex.nf b/modules/samtools/samtools_non_chain_reindex.nf index 2e2cf3f..f024be5 100644 --- a/modules/samtools/samtools_non_chain_reindex.nf +++ b/modules/samtools/samtools_non_chain_reindex.nf @@ -29,9 +29,11 @@ process NON_CHAIN_REINDEX { -h 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y \ | grep -ve 'SN:MT*\\|SN:GL*\\|SN:JH:*' > tmp.sam - samtools view -b tmp.sam \ + # Re-sort BAM following MT removal. This is required for PICARD MERGE + samtools sort -@ $task.cpus tmp.sam \ > ${sampleID}.sorted.rmDup.rmChrM.rmMulti.filtered.shifted.mm10.bam + # Index BAM samtools index \ ${sampleID}.sorted.rmDup.rmChrM.rmMulti.filtered.shifted.mm10.bam """ diff --git a/modules/samtools/samtools_stats_insertsize.nf b/modules/samtools/samtools_stats_insertsize.nf index 4d8b5e5..0a7e2f8 100644 --- a/modules/samtools/samtools_stats_insertsize.nf +++ b/modules/samtools/samtools_stats_insertsize.nf @@ -3,7 +3,7 @@ process SAMTOOLS_STATS_INSERTSIZE { cpus 8 memory 1.GB - time '01:00:00' + time '02:00:00' errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/samtools:1.14--hb421002_0' diff --git a/modules/svaba/svaba.nf b/modules/svaba/svaba.nf index 109d984..c1d198f 100644 --- a/modules/svaba/svaba.nf +++ b/modules/svaba/svaba.nf @@ -2,8 +2,8 @@ process SVABA { tag "$sampleID" cpus = 8 - memory { normal_bam.size() < 60.GB ? 15.GB : 48.GB } - time { normal_bam.size() < 60.GB ? '10:00:00' : '24:00:00' } + memory { normal_bam.size() < 60.GB ? 30.GB : 48.GB } + time { normal_bam.size() < 60.GB ? '18:00:00' : '24:00:00' } errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/svaba:v0.2.1' diff --git a/nextflow.config b/nextflow.config index 181497d..89ac3b2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,7 +21,7 @@ params { keep_intermediate = false // true fastq2 = true // default is PE for workflows tmpdir = "/flashscratch/${USER}" // generic param - + merge_replicates = false // get help help = null @@ -45,7 +45,7 @@ manifest { homePage = "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" mainScript = "main.nf" nextflowVersion = "!>=22.04.3" - version = "0.7.2" + version = "0.7.3" author = 'Michael Lloyd, Brian Sanderson, Barry Guglielmo, Sai Lek, Peter Fields, Harshpreet Chandok, Carolyn Paisie, Gabriel Rech, Ardian Ferraj, Tejas Temker, Anuj Srivastava. Copyright Jackson Laboratory 2024' } diff --git a/subworkflows/aria_download_parse.nf b/subworkflows/aria_download_parse.nf index d756c6a..b2e06d1 100644 --- a/subworkflows/aria_download_parse.nf +++ b/subworkflows/aria_download_parse.nf @@ -28,15 +28,25 @@ workflow FILE_DOWNLOAD { if (params.read_type == 'PE') { aria_download_input = ch_input_sample .multiMap { it -> - R1: tuple(it[0], it[1], 'R1', it[2]) - R2: tuple(it[0], it[1], 'R2', it[3]) + if (params.merge_replicates) { + sampleID = it[1].sampleID+'_'+it[1].replicate + } else { + sampleID = it[1].sampleID + } + R1: tuple(sampleID, it[1], 'R1', it[2]) + R2: tuple(sampleID, it[1], 'R2', it[3]) } .mix() group_size = 2 } else { aria_download_input = ch_input_sample .multiMap { it -> - R1: tuple(it[0], it[1], 'R1', it[2]) + if (params.merge_replicates) { + sampleID = it[1].sampleID+'_'+it[1].replicate + } else { + sampleID = it[1].sampleID + } + R1: tuple(sampleID, it[1], 'R1', it[2]) } .mix() group_size = 1 diff --git a/subworkflows/concatenate_local_files.nf b/subworkflows/concatenate_local_files.nf index 36fb8a3..9f9045a 100644 --- a/subworkflows/concatenate_local_files.nf +++ b/subworkflows/concatenate_local_files.nf @@ -15,9 +15,15 @@ workflow CONCATENATE_LOCAL_FILES { temp_map = ch_input_sample .multiMap { it -> def meta = [:] - meta.sampleID = it[1].sampleID - R1: tuple(it[0], it[1].lane, meta, 'R1', it[2]) - R2: tuple(it[0], it[1].lane, meta, 'R2', it[3]) + if (params.merge_replicates) { + meta.sampleID = it[1].replicate != 'NA' ? 
it[1].sampleID+'_'+it[1].replicate : it[1].sampleID
+                    meta.replicate = it[1].replicate
+                    meta.baseSampleID = it[1].sampleID
+                } else {
+                    meta.sampleID = it[1].sampleID
+                }
+                R1: tuple(meta.sampleID, it[1].lane, meta, 'R1', it[2])
+                R2: tuple(meta.sampleID, it[1].lane, meta, 'R2', it[3])
             }
             .mix()
             .groupTuple(by: [0,2,3])
@@ -34,8 +40,14 @@ workflow CONCATENATE_LOCAL_FILES {
         temp_map = ch_input_sample
             .multiMap { it ->
                 def meta = [:]
-                meta.sampleID = it[1].sampleID
-                R1: tuple(it[0], it[1].lane, meta, 'R1', it[2])
+                if (params.merge_replicates) {
+                    meta.sampleID = it[1].replicate != 'NA' ? it[1].sampleID+'_'+it[1].replicate : it[1].sampleID
+                    meta.replicate = it[1].replicate
+                    meta.baseSampleID = it[1].sampleID
+                } else {
+                    meta.sampleID = it[1].sampleID
+                }
+                R1: tuple(meta.sampleID, it[1].lane, meta, 'R1', it[2])
             }
             .groupTuple(by: [0,2,3])
             .map{ it -> tuple(it[0], it[1].size(), it[2], it[3], it[4]) } // sampleID, num_lanes, meta, read_ID:[R1], file
diff --git a/workflows/atac.nf b/workflows/atac.nf
index 5f6e56a..1d2c21e 100755
--- a/workflows/atac.nf
+++ b/workflows/atac.nf
@@ -22,12 +22,16 @@ include {REMOVE_DUPLICATE_READS} from "${projectDir}/modules/samtools/samtools_r
 include {CALC_MTDNA_FILTER_CHRM} from "${projectDir}/modules/samtools/samtools_calc_mtdna_filter_chrm"
 include {FILTER_REMOVE_MULTI_SHIFT} from "${projectDir}/modules/samtools/samtools_filter_remove_multi_shift"
 include {FILTER_REMOVE_MULTI_SIEVE} from "${projectDir}/modules/deeptools/deeptools_filter_remove_multi_sieve"
-include {CHAIN_CONVERT} from "${projectDir}/modules/g2gtools/g2gtools_chain_convert_peak"
+include {CHAIN_CONVERT} from "${projectDir}/modules/g2gtools/g2gtools_chain_convert"
+include {VCI_CONVERT} from "${projectDir}/modules/g2gtools/g2gtools_vci_convert"
 include {CHAIN_EXTRACT_BADREADS} from "${projectDir}/modules/gatk/gatk_chain_extract_badreads"
 include {CHAIN_BAD2UNIQ_READS} from "${projectDir}/modules/samtools/samtools_chain_bad2uniq_reads"
 include {CHAIN_FILTER_READS} from "${projectDir}/modules/gatk/gatk_chain_filter_reads"
 include {CHAIN_SORT_FIXMATE_BAM} from "${projectDir}/modules/samtools/samtools_chain_sort_fixmate_bam"
 include {NON_CHAIN_REINDEX} from "${projectDir}/modules/samtools/samtools_non_chain_reindex"
+include {SAMTOOLS_INDEX;
+         SAMTOOLS_INDEX as SAMTOOLS_INDEX_MERGED} from "${projectDir}/modules/samtools/samtools_index"
+include {PICARD_MERGESAMFILES} from "${projectDir}/modules/picard/picard_mergesamfiles"
 include {PEAK_CALLING} from "${projectDir}/modules/macs2/macs2_peak_calling"
 include {BAM_COVERAGE_BIGWIG} from "${projectDir}/modules/deeptools/deeptools_bam_coverage_bigwig"
 include {FRIP_READS_IN_PEAKS} from "${projectDir}/modules/bedtools/bedtools_frip_reads_in_peaks"
@@ -54,6 +58,18 @@
 if (params.download_data && !params.csv_input) {
     exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information."
 }
 
+if (params.merge_replicates && !params.csv_input) {
+    exit 1, "Replicate merging was requested with `--merge_replicates`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--merge_replicates` requires a CSV manifest. See `--help` for information."
+}
+
+if (!(params.genome_build in ['GRCm38', 'GRCm39', 'GRCh38'])) {
+    exit 1, "Invalid genome build specified. Please use one of the following: GRCm38, GRCm39, GRCh38."
+}
+
+if (params.gen_org == 'human' && params.genome_build != 'GRCh38') {
+    exit 1, "Invalid genome build specified for human. Please use GRCh38."
+}
+
 // prepare reads channel
 if (params.csv_input) {
 
@@ -164,11 +180,16 @@ workflow ATAC {
     // If Mouse
     if (params.gen_org=='mouse'){
         // Step 11: Convert peak coordinates
-        // Step occurs when chain != null || chain != false
-        CHAIN_CONVERT(SORT_SHIFTED_BAM.out.sorted_file)
-
+        // Chain and VCI steps occur when chain != null || chain != false
+        if (params.genome_build == 'GRCm38') {
+            CHAIN_CONVERT(SORT_SHIFTED_BAM.out.sorted_file)
+            converted_bam = CHAIN_CONVERT.out.converted_bam
+        } else if (params.genome_build == 'GRCm39') {
+            VCI_CONVERT(SORT_SHIFTED_BAM.out.sorted_file)
+            converted_bam = VCI_CONVERT.out.converted_bam
+        }
         // Step 12: Sort bam by coordinates
-        SORT_LIFTOVER_BAM(CHAIN_CONVERT.out.converted_bam, '-O bam', 'bam')
+        SORT_LIFTOVER_BAM(converted_bam, '-O bam', 'bam')
 
         // Step 13: Extract a list of 'bad reads'
         CHAIN_EXTRACT_BADREADS(SORT_LIFTOVER_BAM.out.sorted_file)
@@ -199,31 +220,59 @@ workflow ATAC {
 
     } else if (params.gen_org=='human'){
-        data_ch = SORT_SHIFTED_BAM.out.sorted_file
-    }
+        SAMTOOLS_INDEX(SORT_SHIFTED_BAM.out.sorted_file)
+        data_ch = SORT_SHIFTED_BAM.out.sorted_file.join(SAMTOOLS_INDEX.out.bai)
+                  .map{it -> [it[0], [it[1], it[2]]]}
+    }
+
+    if (params.merge_replicates) {
+
+        data_ch.join(meta_ch)
+        .map{it -> [it[2].baseSampleID, it[1][0], it[1][1]]}
+        .groupTuple(by: [0])
+        .map { it -> [ it[0], [it[1], it[2]].flatten() ] }
+        .branch {
+            merge: it[1].size() > 2
+            pass: it[1].size() == 2
+        }
+        .set{merge_ch}
+        // samples with more than one replicate will be merged (i.e., [BAM, BAI, BAM, BAI, ...])
+        // samples with a single replicate (i.e., [BAM, BAI]) are passed through, then added back to the channel with `mix`
+
+        PICARD_MERGESAMFILES(merge_ch.merge)
+        SAMTOOLS_INDEX_MERGED(PICARD_MERGESAMFILES.out.bam)
+
+        peak_ch = PICARD_MERGESAMFILES.out.bam.join(SAMTOOLS_INDEX_MERGED.out.bai)
+                  .map{it -> [it[0], [it[1], it[2]]]}
+                  .mix(merge_ch.pass)
+        // samples where merge was not done are added back to the channel with `mix`
+
+    } else {
+        peak_ch = data_ch
+    }
 
     // Step 19: Peak calling
-    PEAK_CALLING(data_ch)
+    PEAK_CALLING(peak_ch)
 
     if (params.gen_org=='mouse'){
         // Step 20: deeptools bamCoverage bigwig
-        BAM_COVERAGE_BIGWIG(data_ch)
+        BAM_COVERAGE_BIGWIG(peak_ch)
     }
 
     // Step 21: Fraction of reads in peaks (FRiP)
-    frip_start = data_ch.join(PEAK_CALLING.out.np)
+    frip_start = peak_ch.join(PEAK_CALLING.out.np)
     FRIP_READS_IN_PEAKS(frip_start)
 
     // Step 22: Final calculate (FRiP)
-    frip_calc = data_ch.join(FRIP_READS_IN_PEAKS.out[0])
+    frip_calc = peak_ch.join(FRIP_READS_IN_PEAKS.out[0])
     FINAL_CALC_FRIP(frip_calc)
 
     // Step 23: Get coverage in each peak
     PEAK_COVERAGE(PEAK_CALLING.out.np)
 
     // Step 24: Feature counts
-    feature_counts = data_ch.join(PEAK_COVERAGE.out)
+    feature_counts = peak_ch.join(PEAK_COVERAGE.out)
     FEATURE_COUNTS(feature_counts)
 
     if (params.gen_org=='mouse'){

From fec7d0f3fbe9e46a923eb87c43f10259933b7d82 Mon Sep 17 00:00:00 2001
From: Mike Lloyd
Date: Thu, 26 Sep 2024 09:49:02 -0400
Subject: [PATCH 3/3] add max rss and vmem to trace report

---
 nextflow.config | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index 89ac3b2..1a6716d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -87,11 +87,13 @@ trace {
         + "%cpu," \
         + "%mem," \
         + "rss," \
+        + "peak_rss," \
         + "vmem," \
+        + "peak_vmem," \
         + "rchar," \
         + "wchar"
 }
-
+
 dag {
     enabled = true
    file = 
"${params.pubdir}/trace/${params.workflow}_dagPlot_${trace_timestamp}.html"