Skip to content

Commit

Permalink
Fastq align dedup bismark (#6998)
Browse files Browse the repository at this point in the history
* init subworkflow

* bismark + bismark_hisat tests

* update coverage2cytosine outputs

* update meta, tests

* rm index from snapshot

* Apply suggestions from code review

Co-authored-by: Mahesh Binzer-Panchal <[email protected]>

* apply suggestions from code review and improve tests

* rm extra params from test config

---------

Co-authored-by: Mahesh Binzer-Panchal <[email protected]>
  • Loading branch information
sateeshperi and mahesh-panchal authored Nov 16, 2024
1 parent 484afd1 commit 48f98fa
Show file tree
Hide file tree
Showing 5 changed files with 1,252 additions and 0 deletions.
153 changes: 153 additions & 0 deletions subworkflows/nf-core/fastq_align_dedup_bismark/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
include { BISMARK_ALIGN } from '../../../modules/nf-core/bismark/align/main'
include { BISMARK_DEDUPLICATE } from '../../../modules/nf-core/bismark/deduplicate/main'
include { BISMARK_METHYLATIONEXTRACTOR } from '../../../modules/nf-core/bismark/methylationextractor/main'
include { BISMARK_COVERAGE2CYTOSINE } from '../../../modules/nf-core/bismark/coverage2cytosine/main'
include { BISMARK_REPORT } from '../../../modules/nf-core/bismark/report/main'
include { BISMARK_SUMMARY } from '../../../modules/nf-core/bismark/summary/main'
include { SAMTOOLS_SORT as SAMTOOLS_SORT_DEDUPLICATED } from '../../../modules/nf-core/samtools/sort/main'
include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_DEDUPLICATED } from '../../../modules/nf-core/samtools/index/main'

workflow FASTQ_ALIGN_DEDUP_BISMARK {

take:
ch_reads // channel: [ val(meta), [ reads ] ]
ch_fasta // channel: [ val(meta), [ fasta ] ]
ch_bismark_index // channel: [ val(meta), [ bismark index ] ]
skip_deduplication // boolean: whether to deduplicate alignments
cytosine_report // boolean: whether the run coverage2cytosine

main:
ch_versions = Channel.empty()
ch_coverage2cytosine_coverage = Channel.empty()
ch_coverage2cytosine_report = Channel.empty()
ch_coverage2cytosine_summary = Channel.empty()
ch_methylation_bedgraph = Channel.empty()
ch_methylation_calls = Channel.empty()
ch_methylation_coverage = Channel.empty()
ch_methylation_report = Channel.empty()
ch_methylation_mbias = Channel.empty()
ch_bismark_report = Channel.empty()
ch_bismark_summary = Channel.empty()
ch_multiqc_files = Channel.empty()

/*
* Align with bismark
*/
BISMARK_ALIGN (
ch_reads,
ch_fasta,
ch_bismark_index
)
ch_versions = ch_versions.mix(BISMARK_ALIGN.out.versions)

if (skip_deduplication) {
alignments = BISMARK_ALIGN.out.bam
alignment_reports = BISMARK_ALIGN.out.report.map{ meta, report -> [ meta, report, [] ] }
} else {
/*
* Run deduplicate_bismark
*/
BISMARK_DEDUPLICATE (
BISMARK_ALIGN.out.bam
)
alignments = BISMARK_DEDUPLICATE.out.bam
alignment_reports = BISMARK_ALIGN.out.report.join(BISMARK_DEDUPLICATE.out.report)
ch_versions = ch_versions.mix(BISMARK_DEDUPLICATE.out.versions)
}

/*
* Run bismark_methylation_extractor
*/
BISMARK_METHYLATIONEXTRACTOR (
alignments,
ch_bismark_index
)
ch_methylation_bedgraph = BISMARK_METHYLATIONEXTRACTOR.out.bedgraph
ch_methylation_calls = BISMARK_METHYLATIONEXTRACTOR.out.methylation_calls
ch_methylation_coverage = BISMARK_METHYLATIONEXTRACTOR.out.coverage
ch_methylation_report = BISMARK_METHYLATIONEXTRACTOR.out.report
ch_methylation_mbias = BISMARK_METHYLATIONEXTRACTOR.out.mbias
ch_versions = ch_versions.mix(BISMARK_METHYLATIONEXTRACTOR.out.versions)

/*
* Run bismark coverage2cytosine
*/
if (cytosine_report) {
BISMARK_COVERAGE2CYTOSINE (
BISMARK_METHYLATIONEXTRACTOR.out.coverage,
ch_fasta,
ch_bismark_index
)
ch_coverage2cytosine_coverage = BISMARK_COVERAGE2CYTOSINE.out.coverage
ch_coverage2cytosine_report = BISMARK_COVERAGE2CYTOSINE.out.report
ch_coverage2cytosine_summary = BISMARK_COVERAGE2CYTOSINE.out.summary
ch_versions = ch_versions.mix(BISMARK_COVERAGE2CYTOSINE.out.versions)
}

/*
* Generate bismark sample reports
*/
BISMARK_REPORT (
alignment_reports
.join(ch_methylation_report)
.join(ch_methylation_mbias)
)
ch_bismark_report = BISMARK_REPORT.out.report
ch_versions = ch_versions.mix(BISMARK_REPORT.out.versions)

/*
* Generate bismark summary report
*/
BISMARK_SUMMARY (
BISMARK_ALIGN.out.bam.collect{ meta, bam -> bam.name },
alignment_reports.collect{ meta, align_report, dedup_report -> align_report },
alignment_reports.collect{ meta, align_report, dedup_report -> dedup_report }.ifEmpty([]),
BISMARK_METHYLATIONEXTRACTOR.out.report.collect{ meta, report -> report },
BISMARK_METHYLATIONEXTRACTOR.out.mbias.collect{ meta, mbias -> mbias }
)
ch_bismark_summary = BISMARK_SUMMARY.out.summary
ch_versions = ch_versions.mix(BISMARK_SUMMARY.out.versions)

/*
* MODULE: Run samtools sort on dedup bam
*/
SAMTOOLS_SORT_DEDUPLICATED (
alignments,
[[:],[]] // [ [meta], [fasta]]
)
ch_versions = ch_versions.mix(SAMTOOLS_SORT_DEDUPLICATED.out.versions)

/*
* MODULE: Run samtools index on dedup bam
*/
SAMTOOLS_INDEX_DEDUPLICATED (
SAMTOOLS_SORT_DEDUPLICATED.out.bam
)
ch_versions = ch_versions.mix(SAMTOOLS_INDEX_DEDUPLICATED.out.versions)

/*
* Collect MultiQC inputs
*/
ch_multiqc_files = BISMARK_SUMMARY.out.summary
.mix(alignment_reports.collect{ meta, align_report, dedup_report -> align_report })
.mix(alignment_reports.collect{ meta, align_report, dedup_report -> dedup_report })
.mix(BISMARK_METHYLATIONEXTRACTOR.out.report.collect{ meta, report -> report })
.mix(BISMARK_METHYLATIONEXTRACTOR.out.mbias.collect{ meta, mbias -> mbias })
.mix(BISMARK_REPORT.out.report.collect{ meta, report -> report })

emit:
dedup_bam = SAMTOOLS_SORT_DEDUPLICATED.out.bam // channel: [ val(meta), [ bam ] ]
dedup_bam_index = SAMTOOLS_INDEX_DEDUPLICATED.out.bai // channel: [ val(meta), [ bai ] ]
coverage2cytosine_coverage = ch_coverage2cytosine_coverage // channel: [ val(meta), [ coverage ] ]
coverage2cytosine_report = ch_coverage2cytosine_report // channel: [ val(meta), [ report ] ]
coverage2cytosine_summary = ch_coverage2cytosine_summary // channel: [ val(meta), [ summary ] ]
methylation_bedgraph = ch_methylation_bedgraph // channel: [ val(meta), [ bedgraph ] ]
methylation_calls = ch_methylation_calls // channel: [ val(meta), [ methylation_calls ] ]
methylation_coverage = ch_methylation_coverage // channel: [ val(meta), [ coverage ] ]
methylation_report = ch_methylation_report // channel: [ val(meta), [ report ] ]
methylation_mbias = ch_methylation_mbias // channel: [ val(meta), [ mbias ] ]
bismark_report = ch_bismark_report // channel: [ val(meta), [ report ] ]
bismark_summary = ch_bismark_summary // channel: [ val(meta), [ summary ] ]
multiqc = ch_multiqc_files // path: *{html,txt}
versions = ch_versions // path: *.version.txt
}
136 changes: 136 additions & 0 deletions subworkflows/nf-core/fastq_align_dedup_bismark/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
name: "fastq_align_dedup_bismark"
description: Align BS-Seq reads to a reference genome using bismark, deduplicate and sort
keywords:
- bismark
- 3-letter genome
- map
- methylation
- 5mC
- methylseq
- bisulphite
- bisulfite
- bam
components:
- bismark/align
- samtools/sort
- samtools/index
- bismark/deduplicate
- bismark/methylationextractor
- bismark/coverage2cytosine
- bismark/report
- bismark/summary
input:
- ch_reads:
description: |
List of input FastQ files of size 1 and 2 for single-end and paired-end data,
respectively.
Structure: [ val(meta), [ path(reads) ] ]
pattern: "*.{fastq,fastq.gz}"
- ch_fasta:
type: file
description: |
Structure: [ val(meta), path(fasta) ]
pattern: "*.{fa,fa.gz}"
- ch_index:
description: |
Bismark genome index files
Structure: [ val(meta), path(index) ]
pattern: "BismarkIndex"
- skip_deduplication:
type: boolean
description: |
Skip deduplication of aligned reads
- cytosine_report:
type: boolean
description: |
Produce coverage2cytosine reports that relates methylation calls back to genomic cytosine contexts
output:
- dedup_bam:
type: file
description: |
Channel containing deduplicated BAM files.
Structure: [ val(meta), path(bam) ]
pattern: "*.bam"
- dedup_bam_index:
type: file
description: |
Channel containing deduplicated BAM index files.
Structure: [ val(meta), path(bam.bai) ]
pattern: "*.bai"
- coverage2cytosine_coverage:
type: file
description: |
Channel containing coverage information from coverage2cytosine.
Structure: [ val(meta), path(coverage.txt) ]
pattern: "*.cov.gz"
- coverage2cytosine_report:
type: file
description: |
Channel containing report from coverage2cytosine summarizing cytosine methylation coverage.
Structure: [ val(meta), path(report.txt) ]
pattern: "*report.txt.gz"
- coverage2cytosine_summary:
type: file
description: |
Channel containing summary information from coverage2cytosine.
Structure: [ val(meta), path(summary.txt) ]
pattern: "*cytosine_context_summary.txt"
- methylation_bedgraph:
type: file
description: |
Channel containing methylation data in bedGraph format.
Structure: [ val(meta), path(methylation.bedgraph) ]
pattern: "*.bedGraph.gz"
- methylation_calls:
type: file
description: |
Channel containing methylation call data.
Structure: [ val(meta), path(calls.txt) ]
pattern: "*.txt.gz"
- methylation_coverage:
type: file
description: |
Channel containing methylation coverage data.
Structure: [ val(meta), path(coverage.txt) ]
pattern: "*.cov.gz"
- methylation_report:
type: file
description: |
Channel containing methylation report detailing methylation patterns.
Structure: [ val(meta), path(report.txt) ]
pattern: "*_splitting_report.txt"
- methylation_mbias:
type: file
description: |
Channel containing M-bias report showing methylation bias across read positions.
Structure: [ val(meta), path(mbias.txt) ]
pattern: "*.M-bias.txt"
- bismark_report:
type: file
description: |
Channel containing Bismark report with mapping and methylation statistics.
Structure: [ val(meta), path(bismark_report.txt) ]
pattern: "*report.{html,txt}"
- bismark_summary:
type: file
description: |
Channel containing Bismark summary report.
Structure: [ val(meta), path(bismark_summary.txt) ]
pattern: "*report.{html,txt}"
- multiqc:
type: file
description: |
Channel containing MultiQC report aggregating results across samples.
Structure: [ val(meta), path(multiqc_report.html) ]
pattern: "*.html"
- versions:
type: file
description: |
File containing software versions.
Structure: [ path(versions.yml) ]
pattern: "versions.yml"
authors:
- "@sateeshperi"
maintainers:
- "@sateeshperi"
Loading

0 comments on commit 48f98fa

Please sign in to comment.