diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 0000000..b8d9635 --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,79 @@ +# Continuous integration and delivery of the main branch. + +name: CI/CD +on: + push: + branches: + - main + +jobs: + # Release on Github and Docker + release: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.9] + + steps: + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Checkout github repo + uses: actions/checkout@v2 + with: + submodules: recursive + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + $CONDA/bin/conda install -y -c bioconda womtool + $CONDA/bin/conda config --add channels conda-forge + $CONDA/bin/conda install -y -c conda-forge miniwdl shellcheck + echo "$CONDA/bin" >> $GITHUB_PATH + + - name: Test with tox + run: tox + + - name: Write commit message + run: git log --format=%B -n 1 ${{ github.event.after }} > release_notes.txt + + - name: Github Bumpversion Action + id: version-bump + uses: jasonamyers/github-bumpversion-action@v1.0.5 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DEFAULT_BUMP: "patch" + + - name: Push changes + uses: ad-m/github-push-action@master + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + tags: true + + - name: Create Github release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: v${{ steps.version-bump.outputs.new_ver }} + release_name: 'v${{ steps.version-bump.outputs.new_ver }}' + body_path: "release_notes.txt" + draft: false + prerelease: false + + - id: 'auth' + name: 'Authenticate to Google Cloud' + uses: 'google-github-actions/auth@v0' + with: + credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}' + + - name: Build Docker images + run: |- + gcloud config set account 
aou-lr-docker-builder@broad-dsp-lrma.iam.gserviceaccount.com + gcloud auth configure-docker -q us-central1-docker.pkg.dev + ./scripts/build_docker_images.sh ${{ steps.version-bump.outputs.new_ver }} + diff --git a/.github/workflows/ci_push.yml b/.github/workflows/ci_push.yml new file mode 100644 index 0000000..3f89bb3 --- /dev/null +++ b/.github/workflows/ci_push.yml @@ -0,0 +1,61 @@ +# Continuous integration test of any branch. + +name: CI +on: + push: + branches-ignore: + - main + +jobs: + # Run Python tests on Github runner + run_tox_test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.9] + + steps: + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Checkout github repo + uses: actions/checkout@v2 + with: + submodules: recursive + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + $CONDA/bin/conda install -y -c bioconda womtool + $CONDA/bin/conda config --add channels conda-forge + $CONDA/bin/conda install -y -c conda-forge miniwdl shellcheck + echo "$CONDA/bin" >> $GITHUB_PATH + + - name: Test with tox + run: tox + + # Build Docker image + build_docker_images: + runs-on: ubuntu-latest + + steps: + - name: Checkout github repo + uses: actions/checkout@v2 + with: + submodules: recursive + + - id: 'auth' + name: 'Authenticate to Google Cloud' + uses: 'google-github-actions/auth@v0' + with: + credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}' + + - name: Build Docker images + run: |- + gcloud config set account aou-lr-docker-builder@broad-dsp-lrma.iam.gserviceaccount.com + gcloud auth configure-docker -q us-central1-docker.pkg.dev + ./scripts/build_docker_images.sh $(git rev-parse --abbrev-ref HEAD) + diff --git a/DEVELOP.md b/DEVELOP.md new file mode 100644 index 0000000..0f97766 --- /dev/null +++ b/DEVELOP.md @@ -0,0 +1,82 @@ +# AoU LR Playground +Tools to make working with AoU 
LR data easier. + +Current version: 0.0.8 + +## Development + +To do development in this codebase, the python3 development package must +be installed. + +After installation the development environment can be set up by +the following commands: + + python3 -mvenv venv + . venv/bin/activate + pip install --upgrade pip + pip install -r requirements.txt + pip install -e . + +### Linting files + + # run all linting commands + tox -e lint + + # reformat all project files + black src tests setup.py + + # sort imports in project files + isort -rc src tests setup.py + + # check pep8 against all project files + flake8 src tests setup.py + + # lint python code for common errors and codestyle issues + pylint src + +### Tests + + # run all linting and test + tox + + # run only (fast) unit tests + tox -e unit + + # run only integration tests + tox -e integration + + # run only a single test + # (in this case, the integration tests for `annotate`) + tox -e singletest -- tests/integration/test_annotate.py::test_annotate + + # run only linting + tox -e lint + +Note: If you run into "module not found" errors when running tox for testing, verify the modules are listed in test-requirements.txt and delete the .tox folder to force tox to refresh dependencies. + +## Releasing + +Docker images are released automatically upon pushes to the main branch (generally by merging branches into main). Automatic releases consist of the following steps: + +### Versioning + +We use `bumpversion` to maintain version numbers. It will automatically create a new tag each time it is run. +*DO NOT MANUALLY EDIT ANY VERSION NUMBERS.* + +Our versions are specified by a 3 number semantic version system (https://semver.org/): + + major.minor.patch + +By default, pushes to main will increment the patch number. Major and minor version numbers can be incremented through special keywords in commit messages. 
From the [automated-version-bump](https://github.com/marketplace/actions/automated-version-bump) Github action documentation: + +> Based on the commit messages, increment the version from the latest release. +> * If the string "BREAKING CHANGE", "major" or the Attention pattern refactor!: drop support for Node 6 is found anywhere in any of the commit messages or descriptions the major version will be incremented. +> * If a commit message begins with the string "feat" or includes "minor" then the minor version will be increased. This works for most common commit metadata for feature additions: "feat: new API" and "feature: new API". +> * If a commit message contains the word "pre-alpha" or "pre-beta" or "pre-rc" then the pre-release version will be increased (for example specifying pre-alpha: 1.6.0-alpha.1 -> 1.6.0-alpha.2 or, specifying pre-beta: 1.6.0-alpha.1 -> 1.6.0-beta.0) +All other changes will increment the patch version. + +Versions will always be bumped from the *main branch* _after_ merging in any PRs for that version. + +### Docker images + +This repository includes a Docker folder that contains subdirectories with `Dockerfile`. New images will be built automatically, named after the subdirectory in which they appear, and pushed to [Google's Artifact Registry](https://console.cloud.google.com/artifacts/docker/broad-dsp-lrma/us-central1/aou-lr) . diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..1cdb08d --- /dev/null +++ b/LICENSE @@ -0,0 +1,28 @@ +BSD 3-Clause License + +Copyright (c) 2023, Kiran V Garimella + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md deleted file mode 100644 index 7b08f2c..0000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# long-reads-public-codebase -This Repo will be for the sharing of code for the long reads analysis initiative. diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..682ddfd --- /dev/null +++ b/README.rst @@ -0,0 +1,33 @@ +AoU LR +"""""" + +|GitHub release| |Generic badge| + +.. |GitHub release| image:: https://img.shields.io/github/release/kvg/aou-lr.svg + :target: https://github.com/kvg/longbow/aou-lr + +.. |Generic badge| image:: https://img.shields.io/badge/Docker-v0.0.8-blue.svg + :target: https://console.cloud.google.com/artifacts/docker/broad-dsp-lrma/us-central1/aou-lr + +Analysis workflows for AoU long read data. 
+ + +Quick start +----------- + +:: + + # Clone repo + git clone https://github.com/kvg/aou-lr.git + + +Getting help +------------ + +If you encounter bugs or have questions/comments/concerns, please file an issue on our `Github page `_. + + +Developers' guide +----------------- + +For information on contributing to development, visit our `developer documentation `_. diff --git a/docker/aou-lr-sv/Dockerfile b/docker/aou-lr-sv/Dockerfile new file mode 100644 index 0000000..4363b29 --- /dev/null +++ b/docker/aou-lr-sv/Dockerfile @@ -0,0 +1,17 @@ +FROM continuumio/miniconda3 + +MAINTAINER Kiran V Garimella + +# copy other resources +COPY ./environment.yml / + +# install conda packages +RUN conda env create -f /environment.yml && conda clean -a +ENV PATH=/opt/conda/envs/aou-lr-sv/bin/:${PATH} +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/aou-lr-sv/lib/ + +RUN apt-get -y update \ + && apt-get -y install git make cmake protobuf-compiler gcc g++ zlib1g-dev libcurl4-openssl-dev libbz2-dev tree python3-pip liblzma-dev wget curl \ + && apt-get clean + +RUN echo "source activate aou-lr-sv" > ~/.bashrc diff --git a/docker/aou-lr-sv/environment.yml b/docker/aou-lr-sv/environment.yml new file mode 100644 index 0000000..f96832c --- /dev/null +++ b/docker/aou-lr-sv/environment.yml @@ -0,0 +1,8 @@ +name: aou-lr-sv +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - minimap2 +prefix: /opt/conda/envs/aou-lr-sv diff --git a/inputs/HelloWorkflow.json b/inputs/HelloWorkflow.json new file mode 100644 index 0000000..885bf2e --- /dev/null +++ b/inputs/HelloWorkflow.json @@ -0,0 +1,3 @@ +{ + "HelloWorkflow.greeting": "Hi there!" 
+} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8f67cbd --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +google-cloud-storage +numpy +pysam +pytest +pytest-console-scripts +tox diff --git a/resources/workflow_options/default.json b/resources/workflow_options/default.json new file mode 100644 index 0000000..627aa5f --- /dev/null +++ b/resources/workflow_options/default.json @@ -0,0 +1,5 @@ +{ + "write_to_cache": true, + "read_from_cache": true, + "workflow_failure_mode": "ContinueWhilePossible" +} diff --git a/resources/workflow_options/fresh.run.json b/resources/workflow_options/fresh.run.json new file mode 100644 index 0000000..4e6eec9 --- /dev/null +++ b/resources/workflow_options/fresh.run.json @@ -0,0 +1,5 @@ +{ + "write_to_cache": false, + "read_from_cache": false, + "workflow_failure_mode": "ContinueWhilePossible" +} diff --git a/run_hello_workflow.sh b/run_hello_workflow.sh new file mode 100755 index 0000000..89ec11e --- /dev/null +++ b/run_hello_workflow.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -euxo pipefail + +RESOURCES="resources/workflow_options/default.json" +CA="cromshell-alpha -q --hide_logo -t 30 submit -d ./wdl" + +$CA wdl/pipelines/HelloWorkflow.wdl inputs/HelloWorkflow.json diff --git a/scripts/build_docker_images.sh b/scripts/build_docker_images.sh new file mode 100755 index 0000000..96fdd62 --- /dev/null +++ b/scripts/build_docker_images.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -euxo pipefail + +DOCKER_REPO="aou-lr" +LABEL=$1 + +for DOCKER_FILE in $(find docker -name Dockerfile) +do + DIR_NAME=$(dirname $DOCKER_FILE) + DOCKER_NAME=$(basename $DIR_NAME) + + TAG="us-central1-docker.pkg.dev/broad-dsp-lrma/$DOCKER_REPO/$DOCKER_NAME:$LABEL" + + if docker manifest inspect $TAG > /dev/null; then + docker pull $TAG + fi + + docker build -t $TAG $DIR_NAME && docker push $TAG +done diff --git a/test/test_scripts/test_wdl_validity.py b/test/test_scripts/test_wdl_validity.py new file mode 100644 index 
0000000..4daae5d --- /dev/null +++ b/test/test_scripts/test_wdl_validity.py @@ -0,0 +1,31 @@ +import errno +import glob +import os +import subprocess +import pytest +from pathlib import Path + + +@pytest.mark.parametrize( + "tool, subcommand, dir_names", + ( + ["womtool", "validate", ["wdl"]], + ["miniwdl", "check", ["wdl/tasks"]], # miniwdl validation is stricter and takes more time than womtool, so we only run it on 'tasks' since 'deprecated' will eventually be deleted. + ) +) +def test_wdl_validity(script_runner, tool: str, subcommand: str, dir_names: list): + + for dir_name in dir_names: + retsum = 0 + wdl_dir = Path(__file__).resolve().parents[2] / dir_name + if not wdl_dir.exists(): + raise FileNotFoundError( + errno.ENOENT, os.strerror(errno.ENOENT), wdl_dir) + wdl_paths = wdl_dir / "**/*.wdl" + + for wdl in glob.glob(str(wdl_paths), recursive=True): + print(f'{wdl}:') + ret = subprocess.run([tool, subcommand, wdl]) + retsum += ret.returncode + + assert retsum == 0 diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..85e638a --- /dev/null +++ b/tox.ini @@ -0,0 +1,11 @@ +[tox] +envlist = py39 +skipsdist = true + +[testenv] +# install pytest in the virtualenv where commands will be executed +deps = -r ./requirements.txt +script_launch_mode = subprocess +commands = + # NOTE: you can run any command line tool here - not just tests + pytest test/test_scripts/test_wdl_validity.py diff --git a/wdl/lib/.gitignore b/wdl/lib/.gitignore new file mode 100644 index 0000000..82dcb7b --- /dev/null +++ b/wdl/lib/.gitignore @@ -0,0 +1,2 @@ +*.json +*.zip diff --git a/wdl/lib/Alignment/AlignAndCheckFingerprintCCS.wdl b/wdl/lib/Alignment/AlignAndCheckFingerprintCCS.wdl new file mode 100644 index 0000000..9b0fb60 --- /dev/null +++ b/wdl/lib/Alignment/AlignAndCheckFingerprintCCS.wdl @@ -0,0 +1,135 @@ +version 1.0 + +import "../QC/FPCheckAoU.wdl" as FPCheck +import "../QC/CollectPacBioAlignedMetrics.wdl" as AlnMetrics +import "../Utility/PBUtils.wdl" as PB +import 
"../Utility/Utils.wdl" +import "../Utility/GeneralUtils.wdl" + +workflow AlignAndCheckFingerprintCCS { + meta { + description: + "Given an unaligned CCS/HiFi BAM for a sample, align and verify fingerprint." + } + + input { + File uBAM + File uPBI + String bam_sample_name + String library + + Boolean turn_off_fingperprint_check + String fp_store + String sample_id_at_store + + File ref_map_file + } + + parameter_meta { + turn_off_fingperprint_check: "Please turn off fingerprint check if the reference is not GRCh38." + fp_store: "Bucket name and prefix (gs://...) storing the fingerprinting VCFs" + sample_id_at_store: "Name of the sample at the fingerprint store." + + # outputs + alignment_metrics_tar_gz : "A tar.gz file holding the custom alignment metrics." + + fp_status : "A summary string on the fingerprint check result, value is one of [PASS, FAIL, BORDERLINE]." + fp_lod_expected_sample : "An LOD score assuming the BAM is the same sample as the FP VCF, i.e. BAM sourced from the 'expected sample'." + fingerprint_detail_tar_gz : "A tar.gz file holding the fingerprinting details." 
+ } + + Map[String, String] ref_map = read_map(ref_map_file) + + ################################################################################### + if (ceil(size(uBAM, "GB")) > 50) {# shard & align, but practically never true + + Map[String, String] map_presets = { + 'CLR': 'SUBREAD', + 'CCS': 'CCS', + 'ISOSEQ': 'ISOSEQ', + 'MASSEQ': 'SUBREAD', + } + + call Utils.ComputeAllowedLocalSSD as Guess {input: intended_gb = 3*ceil(size(uBAM, "GB") + size(uPBI, "GB"))} + call Utils.RandomZoneSpewer as arbitrary {input: num_of_zones = 3} + + call PB.ShardLongReads { + input: + unaligned_bam = uBAM, unaligned_pbi = uPBI, + num_shards = 50, num_ssds = Guess.numb_of_local_ssd, zones = arbitrary.zones + } + + scatter (unaligned_bam in ShardLongReads.unmapped_shards) { + # sometimes we see the sharded bams mising EOF marker, which then fails record counts, use this as a checkpoint + call Utils.CountBamRecords as ValidateShard {input: bam = unaligned_bam} + + call PB.Align as AlignReads { + input: + bam = unaligned_bam, + ref_fasta = ref_map['fasta'], + sample_name = bam_sample_name, + library = library, + map_preset = map_presets['CCS'], + drop_per_base_N_pulse_tags = true + } + # call Utils.BamToFastq { input: bam = unaligned_bam, prefix = basename(unaligned_bam, ".bam") } + } + + call Utils.MergeBams as MergeAlignedReads { input: bams = AlignReads.aligned_bam, prefix = basename(uBAM, ".bam") } + # call Utils.MergeFastqs as MergeAllFastqs { input: fastqs = BamToFastq.reads_fq } + } + if (! 
(ceil(size(uBAM, "GB")) > 50)) { + call PB.Align as AlignReadsTogether { + input: + bam = uBAM, + ref_fasta = ref_map['fasta'], + sample_name = bam_sample_name, + library = library, + map_preset = 'CCS', + drop_per_base_N_pulse_tags = true + } + } + + File aBAM = select_first([MergeAlignedReads.merged_bam, AlignReadsTogether.aligned_bam]) + File aBAI = select_first([MergeAlignedReads.merged_bai, AlignReadsTogether.aligned_bai]) + call PB.PBIndex as IndexAlignedReads { input: bam = aBAM } + + ################################################################################### + # alignment metrics and fingerprint check + call AlnMetrics.CollectPacBioAlignedMetrics { + input: + aligned_bam = aBAM, + aligned_bai = aBAI, + aligned_pbi = IndexAlignedReads.pbi + } + + call GeneralUtils.TarGZFiles as saveAlnMetrics { + input: + files = flatten([[CollectPacBioAlignedMetrics.custom_aln_metrics_summary, CollectPacBioAlignedMetrics.nanoplot_stats], CollectPacBioAlignedMetrics.nanoplot_pngs]), + name = "alignment.metrics" + } + + if (!turn_off_fingperprint_check){ + call FPCheck.FPCheckAoU { + input: + aligned_bam = aBAM, + aligned_bai = aBAI, + fp_store = fp_store, + sample_id_at_store = sample_id_at_store, + ref_specific_haplotype_map = ref_map['haplotype_map'] + } + call GeneralUtils.TarGZFiles as saveFPRes {input: files = [FPCheckAoU.fingerprint_summary, FPCheckAoU.fingerprint_details], name = 'fingerprint_check.summary_and_details'} + } + + output { + File aligned_bam = aBAM + File aligned_bai = aBAI + File aligned_pbi = IndexAlignedReads.pbi + + File alignment_metrics_tar_gz = saveAlnMetrics.you_got_it + + Float? fp_lod_expected_sample = FPCheckAoU.lod_expected_sample + String? fp_status = FPCheckAoU.FP_status + File? 
fingerprint_detail_tar_gz = saveFPRes.you_got_it + } +} diff --git a/wdl/lib/Alignment/AlignReads.wdl b/wdl/lib/Alignment/AlignReads.wdl new file mode 100644 index 0000000..4f32c04 --- /dev/null +++ b/wdl/lib/Alignment/AlignReads.wdl @@ -0,0 +1,215 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +# A wrapper to minimap2 for mapping & aligning (groups of) sequences to a reference +task Minimap2 { + input { + Array[File] reads + File ref_fasta + + String RG + String map_preset + + String? library + + Array[String] tags_to_preserve = [] + + String prefix = "out" + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + reads: "query sequences to be mapped and aligned" + ref_fasta: "reference fasta" + RG: "read group information to be supplied to parameter '-R' (note that tabs should be input as '\t')" + map_preset: "preset to be used for minimap2 parameter '-x'" + tags_to_preserve: "sam tags to carry over to aligned bam file" + prefix: "[default-valued] prefix for output BAM" + } + + Boolean fix_library_entry = if defined(library) then true else false + + Int disk_size = 1 + 10*2*2*ceil(size(reads, "GB") + size(ref_fasta, "GB")) + + Boolean do_preserve_tags = if length(tags_to_preserve) != 0 then true else false + + Int cpus = 4 + Int mem = 30 + + command <<< + set -euxo pipefail + + NUM_CPUS=$( cat /proc/cpuinfo | grep '^processor' | tail -n1 | awk '{print $NF+1}' ) + RAM_IN_GB=$( free -g | grep "^Mem" | awk '{print $2}' ) + MEM_FOR_SORT=$( echo "" | awk "{print int(($RAM_IN_GB - 1)/$NUM_CPUS)}" ) + + rg_len=$(echo -n '~{RG}' | wc -c | awk '{print $NF}') + if [[ $rg_len -ne 0 ]] ; then + MAP_PARAMS="-ayYL --MD --eqx -x ~{map_preset} -R ~{RG} -t ${NUM_CPUS} ~{ref_fasta}" + else + MAP_PARAMS="-ayYL --MD --eqx -x ~{map_preset} -t ${NUM_CPUS} ~{ref_fasta}" + fi + + SORT_PARAMS="-@${NUM_CPUS} -m${MEM_FOR_SORT}G --no-PG -o ~{prefix}.pre.bam" + FILE="~{reads[0]}" + FILES="~{sep=' ' reads}" + + # We write to a SAM file before sorting and indexing because 
rarely, doing everything + # in a single one-liner leads to a truncated file error and segmentation fault of unknown + # origin. Separating these commands requires more resources, but is more reliable overall. + + if [[ "$FILE" =~ \.fastq$ ]] || [[ "$FILE" =~ \.fq$ ]]; then + cat $FILES | minimap2 $MAP_PARAMS - > tmp.sam + elif [[ "$FILE" =~ \.fastq.gz$ ]] || [[ "$FILE" =~ \.fq.gz$ ]]; then + zcat $FILES | minimap2 $MAP_PARAMS - > tmp.sam + elif [[ "$FILE" =~ \.fasta$ ]] || [[ "$FILE" =~ \.fa$ ]]; then + cat $FILES | python3 /usr/local/bin/cat_as_fastq.py | minimap2 $MAP_PARAMS - > tmp.sam + elif [[ "$FILE" =~ \.fasta.gz$ ]] || [[ "$FILE" =~ \.fa.gz$ ]]; then + zcat $FILES | python3 /usr/local/bin/cat_as_fastq.py | minimap2 $MAP_PARAMS - > tmp.sam + elif [[ "$FILE" =~ \.bam$ ]]; then + + # samtools fastq takes only 1 file at a time so we need to merge them together: + for f in "~{sep=' ' reads}" ; do + if ~{do_preserve_tags} ; then + samtools fastq -T ~{sep=',' tags_to_preserve} "$f" + else + samtools fastq "$f" + fi + done > tmp.fastq + + echo "Memory info:" + cat /proc/meminfo + echo "" + + if ~{do_preserve_tags} ; then + minimap2 ${MAP_PARAMS} -y tmp.fastq > tmp.sam + else + minimap2 ${MAP_PARAMS} tmp.fastq > tmp.sam + fi + else + echo "Did not understand file format for '$FILE'" + exit 1 + fi + + samtools sort ${SORT_PARAMS} tmp.sam + + if ~{fix_library_entry}; then + mv ~{prefix}.pre.bam ~{prefix}.pre.tmp.bam + samtools view --no-PG -H ~{prefix}.pre.tmp.bam > header.txt + awk '$1 ~ /^@RG/' header.txt > rg_line.txt + awk -v lib="~{library}" 'BEGIN {OFS="\t"} { for (i=1; i<=NF; ++i) { if ($i ~ "LB:") $i="LB:"lib } print}' \ + rg_line.txt \ + > fixed_rg_line.txt + sed -n '/@RG/q;p' header.txt > first_half.txt + sed -n '/@RG/,$p' header.txt | sed '1d' > second_half.txt + + cat first_half.txt fixed_rg_line.txt second_half.txt > fixed_header.txt + + date + samtools reheader fixed_header.txt ~{prefix}.pre.tmp.bam > ~{prefix}.pre.bam + rm ~{prefix}.pre.tmp.bam + date 
+ fi + + samtools calmd -b --no-PG ~{prefix}.pre.bam ~{ref_fasta} > ~{prefix}.bam + samtools index -@${NUM_CPUS} ~{prefix}.bam + >>> + + output { + File aligned_bam = "~{prefix}.bam" + File aligned_bai = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: cpus, + mem_gb: mem, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-align:0.1.28" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +# A simple task to covert SAM-formatted alignment to PAF format +task SAMtoPAF { + input { + File sam_formatted_file + File? index + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + sam_formatted_file: "SAM-formated input file to be converted to PAF (note currently we only support SAM or BAM, not CRAM)" + index: "[optional] index for sam_formatted_file" + } + + String prefix = basename(basename(sam_formatted_file, ".bam"), ".sam") # we have hack like this because WDL stdlib doesn't provide endsWith stuff + + Int disk_size = 2*ceil(size(sam_formatted_file, "GB")) + + command <<< + set -eu + + MM2_VERSION="2.24" + + filename=$(basename -- ~{sam_formatted_file}) + extension="${filename##*.}" + if [[ "$extension" == "sam" ]]; then + /minimap2-${MM2_VERSION}_x64-linux/k8 \ + /minimap2-${MM2_VERSION}_x64-linux/paftools.js \ + sam2paf \ + -L \ + ~{sam_formatted_file} \ + > ~{prefix}".paf" + elif [[ "$extension" == "bam" ]]; then + samtools view -h ~{sam_formatted_file} | \ + /minimap2-${MM2_VERSION}_x64-linux/k8 \ + /minimap2-${MM2_VERSION}_x64-linux/paftools.js \ + sam2paf \ + -L \ + - \ + > ~{prefix}".paf" + else + echo "Currently we only support SAM or BAM (not CRAM)." 
&& exit 1; + fi + >>> + + output { + File pat_formatted_file = "~{prefix}.paf" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: "~{disk_size}", + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-align:0.1.28" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/Assembly/Canu.wdl b/wdl/lib/Assembly/Canu.wdl new file mode 100644 index 0000000..474736b --- /dev/null +++ b/wdl/lib/Assembly/Canu.wdl @@ -0,0 +1,223 @@ +version 1.0 + +########################################################################################## +# A workflow that runs the Canu 3-step assembly (correct, trim, assemble). +# - Tested on a small genome (malaria ~23mb), larger genomes may require some changes +# including tweaks to the default resource allocation. 
+# - Currently assumes nanopore reads +########################################################################################## + +import "../../structs/Structs.wdl" + +workflow Canu { + input { + File reads + + Int genome_size + Float correct_error_rate + Float trim_error_rate + Float assemble_error_rate + + String prefix + } + + call Correct { + input: + reads = reads, + genome_size = genome_size, + error_rate = correct_error_rate, + prefix = prefix + } + + call Trim { + input: + genome_size = genome_size, + corrected_reads = Correct.corrected_reads, + error_rate = trim_error_rate, + prefix = prefix, + } + + call Assemble { + input: + genome_size = genome_size, + trimmed_reads = Trim.trimmed_reads, + error_rate = assemble_error_rate, + prefix = prefix, + } + + output { + File fa = Assemble.canu_contigs_fasta + } +} + +# performs canu correct on raw reads, currently assumes ONT reads +task Correct { + input { + File reads + Int genome_size + Float error_rate + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + reads: "reads to be canu-corrected" + genome_size: "estimate on genome size (parameter to canu's 'genomeSize')" + error_rate: "parameter to canu's 'correctedErrorRate'" + prefix: "prefix to output files" + } + + Int disk_size = 150 * ceil(size(reads, "GB")) + + command <<< + set -euxo pipefail + + canu -correct \ + -p ~{prefix} -d canu_correct_output \ + genomeSize=~{genome_size}m \ + corMaxEvidenceErate=0.15 \ + correctedErrorRate=~{error_rate} \ + -nanopore \ + ~{reads} + >>> + + output { + File corrected_reads = "canu_correct_output/~{prefix}.correctedReads.fasta.gz" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 32, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-canu:0.1.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +# performs canu trim on corrected reads +task Trim { + input { + File corrected_reads + Int genome_size + Float error_rate + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + corrected_reads: "reads that have been canu-corrected" + genome_size: "estimate on genome size (parameter to canu's 'genomeSize')" + error_rate: "parameter to canu's 'correctedErrorRate'" + prefix: "prefix to output files" + } + + Int disk_size = 50 * ceil(size(corrected_reads, "GB")) + + command <<< + set -euxo pipefail + + canu -trim \ + -p ~{prefix} -d canu_trim_output \ + genomeSize=~{genome_size}m \ + correctedErrorRate=~{error_rate} \ + -nanopore-corrected \ + ~{corrected_reads} + >>> + + output { + File trimmed_reads = "canu_trim_output/~{prefix}.trimmedReads.fasta.gz" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 32, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-canu:0.1.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +# performs assembly on corrected, then trimmmed reads +task Assemble { + input { + Int genome_size + File trimmed_reads + Float error_rate + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + trimmed_reads: "reads that have been canu-corrected-trimmed" + genome_size: "estimate on genome size (parameter to canu's 'genomeSize')" + error_rate: "parameter to canu's 'correctedErrorRate'" + prefix: "prefix to output files" + } + + Int disk_size = 50 * ceil(size(trimmed_reads, "GB")) + + command <<< + set -euxo pipefail + + canu -assemble \ + -p ~{prefix} -d canu_assemble_output \ + genomeSize=~{genome_size}m \ + correctedErrorRate=~{error_rate} \ + -nanopore-corrected \ + ~{trimmed_reads} + >>> + + output { + File canu_contigs_fasta = "canu_assemble_output/~{prefix}.contigs.fasta" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 32, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-canu:0.1.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} \ No newline at end of file diff --git a/wdl/lib/Assembly/Flye.wdl b/wdl/lib/Assembly/Flye.wdl new file mode 100644 index 0000000..1445e2e --- /dev/null +++ b/wdl/lib/Assembly/Flye.wdl @@ -0,0 +1,76 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +workflow Flye { + input { + File reads + Float genome_size + String prefix + } + + call Assemble { + input: + reads = reads, + prefix = prefix, + runtime_attr_override = { 'mem_gb': 100.0 + 
(genome_size/10000000.0) } + } + + output { + File gfa = Assemble.gfa + File fa = Assemble.fa + } +} + +task Assemble { + input { + File reads + String prefix = "out" + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + reads: "reads (in fasta or fastq format, compressed or uncompressed)" + prefix: "prefix to apply to assembly output filenames" + } + + Int disk_size = 10 * ceil(size(reads, "GB")) + + command <<< + set -euxo pipefail + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + flye --nano-raw ~{reads} --threads $num_core --out-dir asm + + mv asm/assembly.fasta ~{prefix}.flye.fa + mv asm/assembly_graph.gfa ~{prefix}.flye.gfa + >>> + + output { + File gfa = "~{prefix}.flye.gfa" + File fa = "~{prefix}.flye.fa" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 16, + mem_gb: 100, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-flye:2.8.3" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/Assembly/Hifiasm.wdl b/wdl/lib/Assembly/Hifiasm.wdl new file mode 100644 index 0000000..53a45aa --- /dev/null +++ b/wdl/lib/Assembly/Hifiasm.wdl @@ -0,0 +1,210 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +import "../Utility/Utils.wdl" + +workflow Hifiasm { + meta { + description: "We run two 
HiFiasm jobs, one for getting alternative contigs and one for getting the haplotigs. And we take the primary assembly from the first job." + } + + input { + File reads + String prefix + + String zones = "us-central1-a us-central1-b us-central1-c" + } + + parameter_meta { + reads: "reads (in fasta or fastq format, compressed or uncompressed)" + prefix: "prefix to apply to assembly output filenames" + } + + call AssembleForAltContigs { + input: + reads = reads, + prefix = prefix, + zones = zones + } + + call AssembleForHaplotigs { + input: + reads = reads, + prefix = prefix, + zones = zones + } + + output { + File primary_gfa = AssembleForAltContigs.primary_gfa + File primary_tigs = AssembleForAltContigs.primary_tigs + + File alternate_gfa = AssembleForAltContigs.alternate_gfa + File alternate_tigs = AssembleForAltContigs.alternate_tigs + + File log_in_pVSa_mode = AssembleForAltContigs.log + + ########### + Array[File] phased_gfas = AssembleForHaplotigs.phased_gfas + Array[File] phased_tigs = AssembleForHaplotigs.phased_tigs + + File log_in_hap_mode = AssembleForHaplotigs.log + + # these two are saved, but the one generated in the primary VS alternate mode are preferred + File primary_gfa_in_hap_mode = AssembleForHaplotigs.primary_gfa + File primary_tigs_in_hap_mode = AssembleForHaplotigs.primary_fa + } +} + +task AssembleForHaplotigs { + input { + File reads + String prefix = "out" + String zones + + RuntimeAttr? 
runtime_attr_override + } + + Int proposed_memory = 4 * ceil(size(reads, "GB")) + Int memory = if proposed_memory < 96 then 96 else proposed_memory # this 96 magic number is purely empirical + Int n = memory / 4 # this might be an odd number + Int num_cpus_proposal = if (n/2)*2 == n then n else n+1 # a hack because WDL doesn't have modulus operator + Int num_cpus = if num_cpus_proposal > 96 then 96 else num_cpus_proposal + + Int disk_size = 10 * ceil(size(reads, "GB")) + + command <<< + set -euxo pipefail + + time hifiasm \ + -o ~{prefix} \ + -t~{num_cpus} \ + ~{reads} \ + 2>&1 | tee hifiasm.log + + tree -h . + + # GFA graph to contigs, primary + # outputs generated this way has "bp" in their names + awk '/^S/{print ">"$2; print $3}' \ + ~{prefix}.bp.p_ctg.gfa \ + > ~{prefix}.bp.p_ctg.fa + + ls "~{prefix}.bp.hap"*".p_ctg.gfa" + + # GFA graph to contigs, for each haplotig set + for haplotype_gfa in ~{prefix}.bp.hap*.p_ctg.gfa; do + filename=$(basename -- "${haplotype_gfa}") + haplotype="${filename%.*}" + awk '/^S/{print ">"$2; print $3}' \ + "${haplotype_gfa}" \ + > "${haplotype}".fa + done + >>> + + output { + # these are saved, but the one with alt contigs genearted will be preferred for now + File primary_gfa = "~{prefix}.bp.p_ctg.gfa" + File primary_fa = "~{prefix}.bp.p_ctg.fa" + + Array[File] phased_gfas = glob("~{prefix}.bp.hap*.p_ctg.gfa") + Array[File] phased_tigs = glob("~{prefix}.bp.hap*.p_ctg.fa") + + File log = "hifiasm.log" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_cpus, + mem_gb: memory, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-hifiasm:0.16.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + 
select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + zones: zones + } +} + +task AssembleForAltContigs { + input { + File reads + String prefix = "out" + String zones + + RuntimeAttr? runtime_attr_override + } + Int proposed_memory = 4 * ceil(size(reads, "GB")) + Int memory = if proposed_memory < 96 then 96 else if proposed_memory > 512 then 512 else proposed_memory # this 96 magic number is purely empirical + Int n = memory / 4 # this might be an odd number + Int num_cpus_proposal = if (n/2)*2 == n then n else n+1 # a hack because WDL doesn't have modulus operator + Int num_cpus = if num_cpus_proposal > 96 then 96 else num_cpus_proposal + + Int disk_size = 10 * ceil(size(reads, "GB")) + + command <<< + set -euxo pipefail + + time hifiasm \ + -o ~{prefix} \ + -t~{num_cpus} \ + --primary \ + ~{reads} \ + 2>&1 | tee hifiasm.log + + tree -h . 
+ + # tricky, outputs generated this way has no "bp" in their file names + # GFA graph to contigs, primary + awk '/^S/{print ">"$2; print $3}' \ + ~{prefix}.p_ctg.gfa \ + > ~{prefix}.p_ctg.fa + + # GFA graph to contigs, alternate + awk '/^S/{print ">"$2; print $3}' \ + ~{prefix}.a_ctg.gfa \ + > ~{prefix}.a_ctg.fa + >>> + + output { + File primary_gfa = "~{prefix}.p_ctg.gfa" + File primary_tigs = "~{prefix}.p_ctg.fa" + + File alternate_gfa = "~{prefix}.a_ctg.gfa" + File alternate_tigs = "~{prefix}.a_ctg.fa" + + File log = "hifiasm.log" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_cpus, + mem_gb: memory, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-hifiasm:0.16.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + zones: zones + } +} diff --git a/wdl/lib/Assembly/Racon.wdl b/wdl/lib/Assembly/Racon.wdl new file mode 100644 index 0000000..cca31a3 --- /dev/null +++ b/wdl/lib/Assembly/Racon.wdl @@ -0,0 +1,59 @@ +version 1.0 + +########################################################################################## +# A task that polishes a draft assembly with long reads using Racon. 
# - Recommended to run a few times
##########################################################################################

task RaconPolish {

    input {
        File reads
        File draft_assembly

        Int n_rounds
    }

    parameter_meta {
        reads: "long reads to polish the draft assembly with"
        draft_assembly: "draft to be polished"
        n_rounds: "Number of times to run Racon"
    }

    # Memory is sized off the inputs; Racon holds the reads plus the SAM alignment
    # in memory, so 4x the combined input size is a heuristic (see runtime note below).
    Int mem_size = 4 * ceil(size(reads, "GB") + size(draft_assembly, "GB"))
    Int disk_size = mem_size

    command <<<
        set -euxo pipefail

        cp ~{draft_assembly} input_draft.fasta
        # The brace range works because ~{n_rounds} is substituted by the WDL engine
        # before bash parses the line (bash itself would not expand {1..$var}).
        for i in {1..~{n_rounds}}
        do
          # Re-map the long reads onto the current draft, then polish it with Racon;
          # each round's result both is kept (polished_${i}_draft.fasta) and becomes
          # the next round's input.
          minimap2 -ax map-ont input_draft.fasta ~{reads} > aln.sam
          racon -c 1 -m 8 -x -6 -g -8 -w 500 -t 8 ~{reads} aln.sam input_draft.fasta > polished_${i}_draft.fasta
          cp polished_${i}_draft.fasta input_draft.fasta
        done
    >>>

    output {
        # input_draft.fasta is overwritten each round, so this is the final round's polish.
        File final_polished_assembly = "input_draft.fasta"
        Array[File] incremental_polished_assemblies = glob("polished_*_draft.fasta")
    }

    runtime {
        cpu: 8
        # Racon has a high memory requirement. Not sure what it is exactly but you need at least
        # the size of the generated alignment file and more
        memory: mem_size + " GiB"
        disks: "local-disk " + disk_size + " HDD"
        bootDiskSizeGb: 30
        preemptible: 0
        maxRetries: 0
        # NOTE(review): a GPU is provisioned here; presumably the `-c 1` flag above is
        # racon's --cudapoa-batches so the GPU is actually exercised — confirm against
        # the racon build inside the lr-racon image.
        gpuType: "nvidia-tesla-t4"
        gpuCount: 1
        nvidiaDriverVersion: "418.152.00"
        zones: ["us-east1-c"]
        cpuPlatform: "Intel Haswell"
        docker: "us.gcr.io/broad-dsp-lrma/lr-racon:0.1.0"
    }

}
diff --git a/wdl/lib/Preprocessing/CollectParentsKmerStats.wdl b/wdl/lib/Preprocessing/CollectParentsKmerStats.wdl
new file mode 100644
index 0000000..8661bc6
--- /dev/null
+++ b/wdl/lib/Preprocessing/CollectParentsKmerStats.wdl
@@ -0,0 +1,498 @@
version 1.0

##########################################################################################
## A workflow that performs trio-binning of child long reads given parental (short) reads.
## Based on the trio-canu publication
## De novo assembly of haplotype-resolved genomes with trio binning
## https://www.nature.com/articles/nbt.4277
## This holds the sub-workflow for
## part one: collect k-mer stats given parental (short) reads
##########################################################################################

import "../../structs/Structs.wdl"

# Pipeline shape: configure/repartition once, meryl-count each batch in a scatter,
# then merge the per-batch counts and subtract to get parent-specific k-mer DBs.
workflow CollectParentsKmerStats {
    input{

        String workdir_name

        String genome_size
        Int? kmerSize

        String father_short_reads_bucket
        String mother_short_reads_bucket

        Int meryl_operations_threads_est = 8

        Boolean? run_with_debug = false
    }

    parameter_meta {
        workdir_name: "name of working directory"
        genome_size: "an esimate on genome size of the specicies (affects k-value picking)"
        kmerSize: "[optional] force specifying k-value in collecting k-mer stats on parents"

        father_short_reads_bucket: "GCS bucket path holding FASTA/FASTQ of (short) reads of paternal origin"
        mother_short_reads_bucket: "GCS bucket path holding FASTA/FASTQ of (short) reads of maternal origin"

        meryl_operations_threads_est: "[default-valued] estimate on how many threads to allocate to k-mer stats collection step"
        run_with_debug: "[optional] whether to run in debug mode (takes significantly more disk space and more logs); defaults to false"
    }

    ############################################################################
    # we based the implementation of this workflow on a forked canu v1.9
    # the original canu 1.9 is not cloud-friendly
    call ParentalReadsRepartitionAndMerylConfigure {
        input:
            workdir_name = workdir_name,
            genome_size = genome_size,
            kmerSize = kmerSize,
            father_short_reads_bucket = father_short_reads_bucket,
            mother_short_reads_bucket = mother_short_reads_bucket,
            meryl_operations_threads_est = meryl_operations_threads_est,
            run_with_debug = run_with_debug
    }

    # Surface the memory value canu configured so downstream task VMs can be sized from it.
    call PrintMerylMemory {
        input:
            meryl_memory_file = ParentalReadsRepartitionAndMerylConfigure.meryl_memory
    }

    # One MerylCount shard per (batch-id, reads-tarball) pair produced by the configure step.
    scatter (pair in ParentalReadsRepartitionAndMerylConfigure.batch_id_and_parental_short_reads_tar) {
        call MerylCount {
            input:
                workdir_name = workdir_name,
                batch_id_hold_file = pair.left,
                parental_reads_for_this_batch = pair.right,
                meryl_count_script = ParentalReadsRepartitionAndMerylConfigure.count_script,
                kmerSize = kmerSize,
                meryl_operations_threads_est = meryl_operations_threads_est,
                meryl_memory_in_GB = PrintMerylMemory.meryl_memory_in_GB
        }
    }

    call MerylMergeAndSubtract {
        input:
            meryl_merge_script = ParentalReadsRepartitionAndMerylConfigure.merge_script,
            meryl_subtract_script = ParentalReadsRepartitionAndMerylConfigure.subtract_script,
            meryl_count_batches = MerylCount.count_output,
            meryl_operations_threads_est = meryl_operations_threads_est,
            meryl_memory_in_GB = PrintMerylMemory.meryl_memory_in_GB,
            run_with_debug = run_with_debug
    }

    output {
        Array[File] Father_haplotype_merylDB = MerylMergeAndSubtract.subtract_output_father
        Array[File] Mother_haplotype_merylDB = MerylMergeAndSubtract.subtract_output_mother

        File Father_reads_statistics = MerylMergeAndSubtract.merge_stats_father
        File Mother_reads_statistics = MerylMergeAndSubtract.merge_stats_mother
    }
}

###############################################################

# repartition the parental short reads for easier batch-processing by canu/meryl itself
# note that this step is IO bound
task ParentalReadsRepartitionAndMerylConfigure {
    input{

        String workdir_name

        String genome_size
        Int? kmerSize

        String father_short_reads_bucket
        String mother_short_reads_bucket

        Int meryl_operations_threads_est

        Boolean run_with_debug = false

        RuntimeAttr? runtime_attr_override
    }

    # canu CLI fragments assembled from the optional inputs; a single space stands in
    # for "no flag" so plain string concatenation below stays valid.
    String debug_option = if (run_with_debug) then "-debug" else " "
    String kmer_option = if (defined(kmerSize)) then ("-triobinK " + select_first([kmerSize])) else " "
    String extra_args = kmer_option + debug_option

    command <<<
        set -euo pipefail

        # parallel localize the input reads (remove trailing slash first to be safe)
        father_path=$(echo ~{father_short_reads_bucket} | sed 's:/*$::')
        mother_path=$(echo ~{mother_short_reads_bucket} | sed 's:/*$::')
        # NOTE(review): under `set -euo pipefail`, a `gsutil ls` with zero matches exits
        # non-zero and aborts the script before these friendly error messages are ever
        # reached — the explicit checks below presumably only fire in edge cases; confirm.
        a=$(gsutil ls "${father_path}/"*.fastq.gz | wc -l)
        b=$(gsutil ls "${mother_path}/"*.fastq.gz | wc -l)
        if [[ $a == 0 ]]; then
            echo "no reads in ~{father_short_reads_bucket}" && exit 1
        elif [[ $b == 0 ]]; then
            echo "no reads in ~{mother_short_reads_bucket}" && exit 1
        fi
        echo "==================================="
        echo "BEGIN LOCALIZING PARENTAL READS"
        date -u
        export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token`
        mkdir father && gsutil -mq cp "${father_path}/"*.fastq.gz father/ && echo "Father localized"
        mkdir mother && gsutil -mq cp "${mother_path}/"*.fastq.gz mother/ && echo "Mother localized"
        date -u
        echo "DONE LOCALIZING PARENTAL READS"
        echo "==================================="

        ##########
        # re-partition parental reads and
        # very importantly, we don't stop at that, we stop after parental kmer stats configure
        # (i.e. meryl-count, -merge, and -subtract) because
        # the custom docker allows us to output a batch-specific tar.gz of parental read files
        # but we don't use the configuration scripts in later stages
        echo "==================================="
        date -u
        canu \
            -haplotype \
            -p ~{workdir_name} \
            -d /cromwell_root/workdir/ \
            genomeSize=~{genome_size} \
            stopAfter=parent-kmer-stat-conf \
            -haplotypeFather /cromwell_root/father/*.fastq.gz \
            -haplotypeMother /cromwell_root/mother/*.fastq.gz \
            ~{extra_args}
        date -u
        tree workdir # helps debugging, in case something went wrong
        # tar the repartitioned reads according to batches
        # (each *.dict is a file list for one batch; all batches are tarred in parallel)
        cd workdir/haplotype/0-kmers/
        for dict in *.dict; do
            new_name=$(echo $dict | sed 's/dict//')
            cat $dict | tar -czf $new_name"batch.tar.gz" -T - &
        done
        wait
        cd -
        echo "==================================="
        ##########

        # move configured shell scripts up for delocalization
        mv workdir/haplotype/0-kmers/meryl-count.memory .
        mv workdir/haplotype/0-kmers/*.sh .
        # then sed replace the thread configuration:
        # 1. recall that memory was set purely based on file count hence no-need/better-not change
        # 2. the number of threads was configured above using threads available on this VM
        #    and we don't want that.
        th_cnt=~{meryl_operations_threads_est}
        for script in *.sh; do
            sed -i -E "s/threads=[0-9]+/threads=$th_cnt/g" $script;
        done

        # move parental reads up for delocalization
        mv workdir/haplotype/0-kmers/*.dict .
        mv workdir/haplotype/0-kmers/*.batch.tar.gz .

        # grep and sort, then generate an array of flat files that just hold the "$hap-batchid"
        for batch_id in `grep -Eo "output=\"(Father|Mother)-[0-9]+\"" meryl-count.sh | awk -F '=' '{print $2}' | sed 's/"//g'`; do
            echo $batch_id > "$batch_id.txt"
        done
        ls *.txt

        # save logs and scripts
        tar -czf canu-logs.tar.gz workdir/canu-logs
        tar -czf canu-scripts.tar.gz workdir/canu-scripts
    >>>

    output {

        File logs = "canu-logs.tar.gz"
        File scripts = "canu-scripts.tar.gz"

        # batch_ids and the per-batch read tarballs are produced in the same glob order,
        # so zipping them pairs each id with its own tarball.
        Array[File] batch_ids = glob("*.txt")
        Array[File] repartitioned_parental_reads_per_batch = glob("*.batch.tar.gz")
        Array[Pair[String, File]] batch_id_and_parental_short_reads_tar = zip(batch_ids, repartitioned_parental_reads_per_batch)

        File meryl_memory = "meryl-count.memory"
        File count_script = "meryl-count.sh"
        File merge_script = "meryl-merge.sh"
        File subtract_script = "meryl-subtract.sh"
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores: 4,
        mem_gb: 8,
        disk_gb: 600,
        boot_disk_gb: 10,
        preemptible_tries: 0, # explicitly turn this off as we don't save that much for the disk, and pre-emption kills us
        max_retries: 0,
        docker: "quay.io/broad-long-read-pipelines/canu:v1.9_wdl_patch_varibale_k"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" # LOCAL because this task is mostly IO operation
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}

# a hackish step to simply
# print out memory configuration from the above step
# this value is used for configuring the VMs for actual batch Meryl count processing
# as well as memory allocation for the merging and subtracting step
task PrintMerylMemory {
    input {
        File meryl_memory_file
    }

    command <<<
        cat ~{meryl_memory_file}
    >>>

    output {
        # the memory file holds a single integer (GB), echoed to stdout above
        Int meryl_memory_in_GB = read_int(stdout())
    }

    runtime {
        cpu: 1
        memory: "1 GiB"
        docker: "ubuntu:18.04"
    }
}

# meryl-count on one batch
task MerylCount {
    input{

        String workdir_name

        File batch_id_hold_file
        File parental_reads_for_this_batch

        File meryl_count_script

        Int? kmerSize
        Int meryl_operations_threads_est
        Int meryl_memory_in_GB

        RuntimeAttr? runtime_attr_override
    }

    # e.g. "Father-000001" recovered from the batch-id flat file's name
    String postfix = basename(batch_id_hold_file, ".txt")

    Int emperical_memory_lower_limit = 18
    Int memory_to_use = if (meryl_memory_in_GB < emperical_memory_lower_limit) then emperical_memory_lower_limit else meryl_memory_in_GB

    Int emperical_thread_cout = 4 # based on monitor, the task is not CPU intensive (but memory intensive), and higher available CPU improved runtime only marginally

    Int disk_space_gb = if(defined(kmerSize)) then 100 else 50

    command <<<
        set -euo pipefail

        # recreate the directory layout the canu-generated script expects
        mkdir -p workdir/canu-logs workdir/canu-scripts
        mkdir -p workdir/haplotype/0-kmers/reads-Father workdir/haplotype/0-kmers/reads-Mother

        cp ~{meryl_count_script} workdir/haplotype/0-kmers/
        echo ~{meryl_memory_in_GB} > workdir/haplotype/0-kmers/meryl-count.memory

        echo "==================================="
        echo "BEGIN UNPACKING REPARTITIONED PARENTAL READS TO THE DESIRED LOCATIONS"
        date -u
        df -h
        tar xzfv ~{parental_reads_for_this_batch} -C workdir/haplotype/0-kmers/
        rm ~{parental_reads_for_this_batch} # save some disk space
        df -h
        date -u
        tree workdir
        echo "DONE UNPACKING REPARTITIONED PARENTAL READS TO THE DESIRED LOCATIONS"
        echo "==================================="

        ##########
        # run the script
        # this is essentially the command by canu::Execution::submitOrRunParallelJob
        echo "==================================="
        echo "BEGIN KMER COUNTING"
        date -u
        # batch number: strip the "Father-"/"Mother-" prefix and leading zeros
        n=$(cat ~{batch_id_hold_file} | awk -F '-' '{print $2}' | sed 's/^0*//')
        log_name="meryl-count."~{postfix}".out"
        echo "Dealing with batch: ${n}, with log name: ${log_name}"
        cd workdir/haplotype/0-kmers/ && chmod +x meryl-count.sh
        # NOTE(review): the `|| cat` keeps a failing meryl-count run from tripping
        # `set -e` here — the task would instead die later at the tar step with a more
        # confusing error; presumably intentional so the log is surfaced first, but
        # consider `|| { cat ${log_name}; exit 1; }` — confirm intent.
        ./meryl-count.sh ${n} > ${log_name} 2>&1 || cat ${log_name}
        echo "----------"
        echo "tail log files"
        tail -n 5 ${log_name}
        echo "----------"
        date -u
        echo "Done counting, now compressing for delocalization..."
        tar --use-compress-program=pigz -cf reads-~{postfix}.meryl.tar.gz reads-~{postfix}.meryl
        du -sh reads-~{postfix}.meryl.tar.gz
        cd -
        df -h
        date -u
        echo "DONE KMER COUNTING"
        echo "==================================="
        ##########

        mv workdir/haplotype/0-kmers/reads-~{postfix}.meryl.tar.gz .
        mv workdir/haplotype/0-kmers/meryl-count.~{postfix}.out .

        # save logs and scripts
        tar -czf canu-logs.tar.gz workdir/canu-logs
        tar -czf canu-scripts.tar.gz workdir/canu-scripts
    >>>

    output {

        File logs = "canu-logs.tar.gz"
        File scripts = "canu-scripts.tar.gz"

        File count_log = "meryl-count.${postfix}.out"

        File count_output = "reads-~{postfix}.meryl.tar.gz"
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores: emperical_thread_cout,
        mem_gb: memory_to_use,
        disk_gb: disk_space_gb,
        boot_disk_gb: 10,
        preemptible_tries: 1,
        max_retries: 0,
        docker: "quay.io/broad-long-read-pipelines/canu:v1.9_wdl_patch_varibale_k"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}

# merge the batches of the parental reads, generate one result per parent
# then subtract the k-mers
task MerylMergeAndSubtract {
    input{

        File meryl_merge_script
        File meryl_subtract_script

        Array[File] meryl_count_batches

        Int meryl_operations_threads_est
        Int meryl_memory_in_GB

        Boolean run_with_debug = false

        RuntimeAttr? runtime_attr_override
    }

    command <<<
        set -euo pipefail

        echo "==================================="
        date -u
        echo "BEGIN UNPACKING SCATTERED meryl-count RESULTS"
        mkdir -p workdir/canu-logs workdir/canu-scripts workdir/haplotype/0-kmers/
        cp ~{meryl_merge_script} \
           ~{meryl_subtract_script} \
           workdir/haplotype/0-kmers/
        echo ~{meryl_memory_in_GB} > workdir/haplotype/0-kmers/meryl-count.memory
        # unpack all per-batch tarballs in parallel, deleting each as soon as it is extracted
        for zipped_count in `ls ~{sep=' ' meryl_count_batches}`; do
            out_log_prefix=$(basename ${zipped_count})
            (tar xzfv ${zipped_count} -C workdir/haplotype/0-kmers/ > ${out_log_prefix}".unpacking.log" && rm -rf ${zipped_count}) &
        done
        wait
        echo "disk use after unpacking:"
        df -h
        date -u
        echo "DONE UNPACKING SCATTERED meryl-count RESULTS"
        echo "==================================="

        ##########
        # run the shell scripts without canu
        echo "==================================="
        date -u
        cd workdir/haplotype/0-kmers/
        chmod +x meryl-merge.sh && chmod +x meryl-subtract.sh
        # merge (job 1 = Father, job 2 = Mother, run concurrently)
        ./meryl-merge.sh 1 > meryl-merge.000001.out 2>&1 &
        ./meryl-merge.sh 2 > meryl-merge.000002.out 2>&1 &
        wait
        if [[ ~{run_with_debug} == true ]]; then
            cat meryl-merge.000001.out
            cat meryl-merge.000002.out
        fi
        echo "disk use after merge operation:"
        df -h
        du -sh *
        # NOTE(review): these globs likely match nothing — the tarballs were already
        # deleted during the unpacking loop above, and what was extracted here are
        # reads-*.meryl directories (no .tar.gz suffix); with -f the no-match is
        # silent, so the intended disk reclamation presumably never happens. Confirm
        # whether `rm -rf reads-Father-*.meryl reads-Mother-*.meryl` was meant.
        rm -rf reads-Father-*.meryl.tar.gz # delete these count files after merge
        rm -rf reads-Mother-*.meryl.tar.gz # delete these count files after merge
        date -u
        echo "-----------------------------------"
        # subtract
        ./meryl-subtract.sh 1 > meryl-subtract.000001.out 2>&1 &
        ./meryl-subtract.sh 2 > meryl-subtract.000002.out 2>&1 &
        wait
        if [[ ~{run_with_debug} == true ]]; then
            cat meryl-subtract.000001.out
            cat meryl-subtract.000002.out
        fi
        echo "disk use after subtract operation:"
        df -h
        du -sh *
        date -u
        cd -
        echo "==================================="
        ##########

        mv workdir/haplotype/0-kmers/*.out .
        mv workdir/haplotype/0-kmers/reads-*.statistics .

        mv workdir/haplotype/0-kmers/haplotype-Father.meryl .
        mv workdir/haplotype/0-kmers/haplotype-Mother.meryl .
        # marker files so each delocalized DB directory self-identifies its parent
        touch this.is.father.db && mv this.is.father.db haplotype-Father.meryl/
        touch this.is.mother.db && mv this.is.mother.db haplotype-Mother.meryl/

        # save logs and scripts
        tar -czf canu-logs.tar.gz workdir/canu-logs
        tar -czf canu-scripts.tar.gz workdir/canu-scripts
    >>>

    output {
        File logs = "canu-logs.tar.gz"
        File scripts = "canu-scripts.tar.gz"
        Array [File] unpacking_logs = glob("*.unpacking.log")

        Array[File] merge_log = glob("meryl-merge.*.out")
        File merge_stats_father = "reads-Father.statistics"
        File merge_stats_mother = "reads-Mother.statistics"
        # Array[File] merge_output_mother = glob("workdir/haplotype/0-kmers/reads-Mother.meryl/*") # we take these out for now because
        # Array[File] merge_output_father = glob("workdir/haplotype/0-kmers/reads-Father.meryl/*") # these are usually deleted in the canu pipeline

        Array[File] subtract_log = glob("meryl-subtract.*.out")
        Array[File] subtract_output_father = glob("haplotype-Father.meryl/*")
        Array[File] subtract_output_mother = glob("haplotype-Mother.meryl/*")
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores: 2 * meryl_operations_threads_est + 6, # a bit more threads, a bit more concurrency for decompression at the beginning
        mem_gb: 3 * meryl_memory_in_GB + 6, # choosing this specification so that two parallel jobs can be executed at the same time
        disk_gb: 3000, # we strongly recommend you NOT change this number: 1) we've seen close to full disk peak usage, and 2) local SSD's increase by unit of 375GB, this is the maximum
        boot_disk_gb: 10,
        preemptible_tries: 0, # explicitly turn off as this takes a long time
        max_retries: 0,
        docker: "quay.io/broad-long-read-pipelines/canu:v1.9_wdl_patch_varibale_k"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL"
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}
diff --git a/wdl/lib/Preprocessing/Guppy.wdl b/wdl/lib/Preprocessing/Guppy.wdl
new file mode 100644
index 0000000..0705d08
--- /dev/null
+++ b/wdl/lib/Preprocessing/Guppy.wdl
@@ -0,0 +1,383 @@
version 1.0

##########################################################################################
# A workflow that runs the Guppy basecaller on ONT FAST5 files.
# - The docker tag number will match the version of Guppy that is being run. You can change
#   this value to run a different version of Guppy. Currently supports... [3.5.2, 3.6.0, 4.0.14]
# - All fast5 files within the given GCS dir, gcs_fast5_dir, will be processed
# - Takes a few hours to process 130GB. Best guess is that the processing time scales
#   linearly but untested.
+########################################################################################## + +import "../Utility/Utils.wdl" as Utils +import "../Utility/ONTUtils.wdl" as ONT +import "../../structs/Structs.wdl" + +workflow Guppy { + input { + String gcs_fast5_dir + + String config + String? barcode_kit + + String instrument = "unknown" + String flow_cell_id = "unknown" + String? protocol_run_id + String? sample_name + Int? num_shards + + String gcs_out_root_dir + } + + call ListFast5s { input: gcs_fast5_dir = gcs_fast5_dir } + + Int ns = 1 + select_first([num_shards, ceil(length(read_lines(ListFast5s.manifest))/100)]) + call ONT.PartitionManifest as PartitionFast5Manifest { input: manifest = ListFast5s.manifest, N = ns } + + scatter (chunk_index in range(length(PartitionFast5Manifest.manifest_chunks))) { + call Basecall { + input: + fast5_files = read_lines(PartitionFast5Manifest.manifest_chunks[chunk_index]), + config = config, + barcode_kit = barcode_kit, + index = chunk_index + } + } + + call Utils.Timestamp as TimestampStopped { input: dummy_dependencies = Basecall.sequencing_summary } + call Utils.Sum as SumPassingFastqs { input: ints = Basecall.num_pass_fastqs } + call Utils.Sum as SumFailingFastqs { input: ints = Basecall.num_fail_fastqs } + + call MakeSequencingSummary { input: sequencing_summaries = Basecall.sequencing_summary } + + call MakeFinalSummary { + input: + instrument = instrument, + flow_cell_id = flow_cell_id, + sample_id = select_first([sample_name, Basecall.metadata[0]['sampleid']]), + protocol_run_id = select_first([protocol_run_id, Basecall.metadata[0]['runid']]), + started = Basecall.metadata[0]['start_time'], + stopped = TimestampStopped.timestamp + } + + call Utils.Uniq as UniqueBarcodes { input: strings = flatten(Basecall.barcodes) } + + call FinalizeBasecalls { + input: + pass_fastqs = flatten(Basecall.pass_fastqs), + sequencing_summary = MakeSequencingSummary.sequencing_summary, + final_summary = MakeFinalSummary.final_summary, + 
barcodes = UniqueBarcodes.unique_strings, + outdir = gcs_out_root_dir + } + + output { + String gcs_dir = FinalizeBasecalls.gcs_dir + Array[String] barcodes = UniqueBarcodes.unique_strings + Int num_fast5s = length(read_lines(ListFast5s.manifest)) + Int num_pass_fastqs = SumPassingFastqs.sum + Int num_fail_fastqs = SumFailingFastqs.sum + } +} + +task ListFast5s { + input { + String gcs_fast5_dir + + RuntimeAttr? runtime_attr_override + } + + String indir = sub(gcs_fast5_dir, "/$", "") + + command <<< + gsutil ls "~{indir}/**.fast5" > fast5_files.txt + >>> + + output { + File manifest = "fast5_files.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 1, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Basecall { + input { + Array[File] fast5_files + String config = "dna_r10.4.1_e8.2_400bps_sup.cfg" + String? barcode_kit + Int index = 0 + + RuntimeAttr? 
runtime_attr_override + } + +# dna_r10.3_450bps_sup.cfg +# dna_r10.4.1_e8.2_260bps_sup.cfg +# dna_r10.4.1_e8.2_400bps_sup.cfg +# dna_r10.4_e8.1_sup.cfg +# dna_r9.4.1_450bps_sup.cfg +# dna_r9.4.1_450bps_sup_prom.cfg +# dna_r9.4.1_e8.1_sup.cfg +# dna_r9.5_450bps.cfg + + Int disk_size = 3 * ceil(size(fast5_files, "GB")) + + String barcode_arg = if defined(barcode_kit) then "--barcode_kits \"~{barcode_kit}\" --trim_barcodes" else "" + + command <<< + set -x + + guppy_basecaller \ + -r \ + -i /cromwell_root/ \ + -s guppy_output/ \ + -x "cuda:all" \ + -c ~{config} \ + ~{barcode_arg} \ + --compress_fastq + + # Make a list of the barcodes that were seen in the data + find guppy_output/ -name '*fastq*' -not -path '*fail*' -type f | \ + awk -F"/" '{ a=NF-1; a=$a; gsub(/pass/, "unclassified", a); print a }' | \ + sort -n | \ + uniq | tee barcodes.txt + + # Reorganize and rename the passing filter data to include the barcode in the filename + mkdir pass + find guppy_output/ -name '*fastq*' -not -path '*fail*' -type f | \ + awk -F"/" '{ a=NF-1; a=$a; b=$NF; gsub(/pass/, "unclassified", a); c=$NF; for (i = NF-1; i > 0; i--) { c=$i"/"c }; system("mv " c " pass/" a ".chunk_~{index}." b); }' + + # Reorganize and rename the failing filter data to include the barcode in the filename + mkdir fail + find guppy_output/ -name '*fastq*' -not -path '*pass*' -type f | \ + awk -F"/" '{ a=NF-1; a=$a; b=$NF; gsub(/pass/, "unclassified", a); c=$NF; for (i = NF-1; i > 0; i--) { c=$i"/"c }; system("mv " c " fail/" a ".chunk_~{index}." b); }' + + # Count passing and failing files + find pass -name '*fastq.gz' | wc -l | tee num_pass.txt + find fail -name '*fastq.gz' | wc -l | tee num_fail.txt + + # Extract relevant metadata (e.g. sample id, run id, etc.) 
from the first fastq file + find pass -name '*fastq.gz' -type f | \ + head -1 | \ + xargs -n1 zgrep -m1 '^@' | \ + sed 's/ /\n/g' | \ + grep -v '^@' | \ + sed 's/=/\t/g' | tee metadata.txt + >>> + + output { + Array[File] pass_fastqs = glob("pass/*.fastq.gz") + File sequencing_summary = "guppy_output/sequencing_summary.txt" + Array[String] barcodes = read_lines("barcodes.txt") + Map[String, String] metadata = read_map("metadata.txt") + Int num_pass_fastqs = read_int("num_pass.txt") + Int num_fail_fastqs = read_int("num_fail.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-guppy:6.4.6" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + gpuType: "nvidia-tesla-p100" + gpuCount: 1 + nvidiaDriverVersion: "418.152.00" + zones: ["us-central1-c", "us-central1-f", "us-east1-b", "us-east1-c", "us-west1-a", "us-west1-b"] + cpuPlatform: "Intel Haswell" + } +} + +task MakeSequencingSummary { + input { + Array[File] sequencing_summaries + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 3*ceil(size(sequencing_summaries, "GB")) + + command <<< + set -euxo pipefail + + head -1 ~{sequencing_summaries[0]} > sequencing_summary.txt + + while read p; do + awk 'NR > 1 { print }' "$p" >> sequencing_summary.txt + done <~{write_lines(sequencing_summaries)} + >>> + + output { + File sequencing_summary = "sequencing_summary.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MakeFinalSummary { + input { + String instrument + String sample_id + String flow_cell_id + String protocol_run_id + String started + String stopped + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + + command <<< + set -euxo pipefail + + echo 'instrument=~{instrument}' > final_summary.txt + echo 'flow_cell_id=~{flow_cell_id}' >> final_summary.txt + echo 'sample_id=~{sample_id}' >> final_summary.txt + echo 'protocol_run_id=~{protocol_run_id}' >> final_summary.txt + echo 'started=~{started}' >> final_summary.txt + echo 'acquisition_stopped=~{stopped}' >> final_summary.txt + echo 'processing_stopped=~{stopped}' >> final_summary.txt + echo 'basecalling_enabled=1' >> final_summary.txt + echo 'sequencing_summary_file=sequencing_summary.txt' >> final_summary.txt + >>> + + output { + File final_summary = "final_summary.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FinalizeBasecalls { + input { + Array[String] pass_fastqs + File sequencing_summary + File final_summary + Array[String] barcodes + + String outdir + + RuntimeAttr? 
runtime_attr_override + } + + String gcs_output_dir = sub(outdir + "/", "/+$", "") + + command <<< + set -euxo pipefail + + PASS_FASTQ="~{write_lines(pass_fastqs)}" + + while read b; do + OUT_DIR="~{gcs_output_dir}/$b" + PASS_DIR="$OUT_DIR/fastq_pass/" + + grep -w $b $PASS_FASTQ | gsutil -m cp -I $PASS_DIR + + if [ ~{length(barcodes)} -eq 1 ]; then + cp ~{sequencing_summary} sequencing_summary.$b.txt + cp ~{final_summary} final_summary.$b.txt + else + grep -w -e filename -e $b ~{sequencing_summary} > sequencing_summary.$b.txt + sed "s/sample_id=/sample_id=$b./" ~{final_summary} > final_summary.$b.txt + fi + + gsutil cp sequencing_summary.$b.txt $OUT_DIR/ + gsutil cp final_summary.$b.txt $OUT_DIR/ + done <~{write_lines(barcodes)} + >>> + + output { + String gcs_dir = gcs_output_dir + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-finalize:0.1.2" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/Preprocessing/Longbow.wdl b/wdl/lib/Preprocessing/Longbow.wdl new file mode 100644 index 0000000..a62a885 --- /dev/null +++ b/wdl/lib/Preprocessing/Longbow.wdl @@ -0,0 +1,858 @@ +version 1.0 + +import "../../structs/Structs.wdl" +import "../Utility/Utils.wdl" +import 
"../Transcriptomics/MASSeq.wdl" as MAS + +struct LongbowModelParams { + Int umi_length + String? pre_pre_umi_seq + String? pre_umi_seq + String? pre_umi_tag + String? post_umi_seq + String? post_umi_tag +} + +workflow Process { + input { + File bam + + String prefix = "out" + String? model + + File? barcode_allow_list + String? barcode_tag + String? corrected_tag + + Int shard_width = 25 + + Boolean same_barcode_per_read = false + } + + if (!defined(model)) { call Peek as t_01_Peek { input: bam = bam } } + String lbmodel = select_first([model, t_01_Peek.model]) + + # Make a lookup table for our models for now. + # TODO: Once the UMI adjustment is part of longbow, remove this (it's redundant to Longbow): + LongbowModelParams umi_params_mas15 = { 'umi_length': 10, 'pre_pre_umi_seq': 'TCTACACGACGCTCTTCCGATCT', "pre_umi_tag": "CB", "post_umi_seq": "TTTCTTATATGGG" } + LongbowModelParams umi_params_mas15threeP = { 'umi_length': 12, 'pre_pre_umi_seq': 'TCTACACGACGCTCTTCCGATCT', "pre_umi_tag": "CB", "post_umi_seq": "TTTTTTTTTTTTT" } + LongbowModelParams umi_params_mas15BulkWithIndices = { 'umi_length': 10, "pre_umi_seq": 'TCTACACGACGCTCTTCCGATCT', "post_umi_seq": "TTTCTTATATGGG" } + LongbowModelParams umi_params_mas10 = { 'umi_length': 10, 'pre_pre_umi_seq': 'TCTACACGACGCTCTTCCGATCT', "pre_umi_tag": "CB", "post_umi_seq": "TTTCTTATATGGG" } + LongbowModelParams umi_params_slide_seq = { 'umi_length': 9, 'pre_pre_umi_seq': 'TCTTCAGCGTTCCCGAGA', "pre_umi_tag": "X1", "post_umi_seq": "TTTTTTTTTTTTT" } + LongbowModelParams umi_params_mas15teloprimev2 = { 'umi_length': 10, 'pre_pre_umi_seq': 'TCTACACGACGCTCTTCCGATCT', "pre_umi_tag": "CB", "post_umi_seq": "TTTCTTATATGGG" } + + Map[String, LongbowModelParams] longbow_umi_adjustment_params = { + 'mas_15_sc_10x5p_single_none': umi_params_mas15, + 'mas_15_sc_10x3p_single_none': umi_params_mas15threeP, + 'mas_15_bulk_10x5p_single_internal': umi_params_mas15BulkWithIndices, + 'mas_15_spatial_slide-seq_single_none': umi_params_slide_seq, + 
'mas_10_sc_10x5p_single_none': umi_params_mas10, + 'mas_15_bulk_teloprimeV2_single_none': umi_params_mas15teloprimev2 + } + + call Annotate as t_02_Annotate { input: prefix = prefix, bam = bam, model = lbmodel } + call Filter as t_03_Filter { input: prefix = prefix, bam = t_02_Annotate.annotated_bam } + call Segment as t_04_Segment { input: prefix = prefix, bam = t_03_Filter.filtered_bam } + + # Here we remove "truncated" reads, which are the final reads of an array that are bounded by the end of the + # cDNA itself, rather than a known segment delimiter: + call MAS.RemoveMasSeqTruncatedReads as t_05_RemoveMasSeqTruncatedReads { input: prefix = prefix, bam = t_04_Segment.segmented_bam } + + call PadUMI as t_07_PadUMI { + input: + prefix = prefix + "_umi_padded", + bam = t_05_RemoveMasSeqTruncatedReads.non_trucated_bam, + model = lbmodel + } + + # Only call CBC code if we have a single-cell library: + if (lbmodel != "mas_15_bulk_10x5p_single_internal" && lbmodel != "mas_15_bulk_10x5p_single_internal") { + call PadCBC as t_08_PadCBC { + input: prefix = prefix + "_umi_padded_cbc_padded", + bam = t_07_PadUMI.padded_umi_bam, + model = lbmodel, + barcode_tag = barcode_tag, + } + call Correct as t_09_Correct { + input: prefix = prefix + "_umi_padded_cbc_padded_corrected", + bam = t_08_PadCBC.padded_cbc_bam, + model = lbmodel, + barcode_allow_list = barcode_allow_list, + barcode_tag = barcode_tag, + corrected_tag = corrected_tag + } + } + + File bam_for_umi_adjustment = select_first([t_09_Correct.corrected_bam, t_07_PadUMI.padded_umi_bam]) + + call MAS.AdjustUmiSequenceWithAdapterAlignment as t_10_AdjustUmiSequenceWithAdapterAlignment { + input: + prefix = prefix + "_array_elements_CBC_corrected_UMI_adjusted", + bam = bam_for_umi_adjustment, + umi_length = longbow_umi_adjustment_params[lbmodel].umi_length, + pre_pre_umi_seq = longbow_umi_adjustment_params[lbmodel].pre_pre_umi_seq, + pre_umi_seq = longbow_umi_adjustment_params[lbmodel].pre_umi_seq, + pre_umi_tag = 
longbow_umi_adjustment_params[lbmodel].pre_umi_tag, + post_umi_seq = longbow_umi_adjustment_params[lbmodel].post_umi_seq, + post_umi_tag = longbow_umi_adjustment_params[lbmodel].post_umi_tag, + } + + # Only call CBC code if we have a single-cell library: + if (lbmodel != "mas_15_bulk_10x5p_single_internal" && lbmodel != "mas_15_bulk_10x5p_single_internal") { + # Merge our correction stats so we can have a record of them for later: + call AggregateCorrectLogStats { + input: + longbow_correct_log_files = select_all([t_09_Correct.log]), + out_name = prefix + "_longbow_correct_stats.txt" + } + } + + call Extract { input: prefix = prefix, bam = t_10_AdjustUmiSequenceWithAdapterAlignment.umi_adjusted_bam } + + output { + # Output reads: + File annotated_bam = t_02_Annotate.annotated_bam + File segmented_bam = t_04_Segment.segmented_bam + + File filtered_bam = t_03_Filter.filtered_bam + File filter_failed_bam = t_03_Filter.filter_failed_bam + + File? corrected_bam = t_10_AdjustUmiSequenceWithAdapterAlignment.umi_adjusted_bam + File? uncorrectable_bam = t_09_Correct.uncorrected_bam + + File extracted_bam = Extract.extracted_bam + + # Output stats / logs: + File? correct_stats = AggregateCorrectLogStats.stats + File? correct_log = t_09_Correct.log + File umi_adjustment_log = t_10_AdjustUmiSequenceWithAdapterAlignment.log + } +} + +task Peek { + input { + File bam + Int n = 100 + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + longbow peek -n ~{n} -o model.txt ~{bam} + >>> + + output { + String model = read_string("model.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 64, # TODO: Identify and fix `corrupted double-linked list` issue. Lots of memory is a bad bandaid. 
+ disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-longbow:0.5.40" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Annotate { + input { + File bam + String model + + String prefix = "out" + Int num_cpus = 8 + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 10 * ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + longbow annotate -m ~{model} -t ~{num_cpus} -o ~{prefix}.annotated.bam ~{bam} + >>> + + output { + File annotated_bam = "~{prefix}.annotated.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_cpus, + mem_gb: 2*num_cpus, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-longbow:0.5.40" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: 
select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Filter { + input { + File bam + File? bam_pbi + + String? model + + Int num_cpus = 2 + String prefix = "out" + + RuntimeAttr? runtime_attr_override + } + + String model_arg = if defined(model) then " --model " else "" + String pbi_arg = if defined(bam_pbi) then " --pbi " else "" + Int disk_size = 10 * ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + longbow filter \ + -v INFO \ + ~{pbi_arg}~{default="" bam_pbi} \ + ~{model_arg}~{default="" model} \ + ~{bam} \ + -x ~{prefix}_longbow_filter_failed.bam \ + -o ~{prefix}_longbow_filter_passed.bam 2> >(tee longbow_filter_log.txt >&2) # Get log data from stderr and reprint to stderr + >>> + + output { + File filtered_bam = "~{prefix}_longbow_filter_passed.bam" + File filter_failed_bam = "~{prefix}_longbow_filter_failed.bam" + File log = "longbow_filter_log.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_cpus, + mem_gb: 2*num_cpus, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-longbow:0.5.40" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Segment { + input { + File bam + + Int num_cpus = 2 + String prefix = "out" + + 
String? model + + RuntimeAttr? runtime_attr_override + } + + String model_arg = if defined(model) then " --model " else "" + Int disk_size = 10 * ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + longbow segment -v INFO ~{model_arg}~{default="" model} ~{bam} -o ~{prefix}.segmented.bam + >>> + + output { + File segmented_bam = "~{prefix}.segmented.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_cpus, + mem_gb: 2*num_cpus, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-longbow:0.5.40" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Extract { + + input { + File bam + Int num_cpus = 2 + Int base_padding = 2 + + Int? start_offset # For mas15: 16+10 + String? leading_adapter # For mas15: "10x_Adapter" + String? trailing_adapter # For mas15: "Poly_A" + + String prefix = "out" + + File? bam_pbi + + RuntimeAttr? 
runtime_attr_override + } + + String pbi_arg = if defined(bam_pbi) then " --pbi " else "" + String start_offset_arg = if defined(start_offset) then " --start-offset " else "" + String leading_adapter_arg = if defined(leading_adapter) then " --leading-adapter " else "" + String trailing_adapter_arg = if defined(trailing_adapter) then " --trailing-adapter " else "" + + Int disk_size = 10 * ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + source /longbow/venv/bin/activate + longbow extract \ + -v INFO \ + --base-padding ~{base_padding} \ + ~{start_offset_arg}~{default="" start_offset} \ + ~{leading_adapter_arg}~{default="" leading_adapter} \ + ~{trailing_adapter_arg}~{default="" trailing_adapter} \ + ~{pbi_arg}~{default="" bam_pbi} \ + ~{bam} \ + -o ~{prefix}.extracted.bam 2> >(tee longbow_extract_log.txt >&2) # Get log data from stderr and reprint to stderr + >>> + + output { + File extracted_bam = "~{prefix}.extracted.bam" + File log = "longbow_extract_log.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_cpus, + mem_gb: 2*num_cpus, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-longbow:0.5.40" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task PadUMI +{ + input { + File bam + String model + + String prefix = "out" + 
Int padding = 2 + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 10*ceil(size(bam, "GB")) + + Map[String, String] umi_tag = { + "mas_15_sc_10x5p_single_none": "ZU", + "mas_15_sc_10x3p_single_none": "ZU", + "mas_15_bulk_10x5p_single_internal": "ZU", + "mas_10_sc_10x5p_single_none": "ZU", + "mas_15_spatial_slide-seq_single_none": "ZU", + "mas_15_bulk_teloprimeV2_single_none": "ZU", + } + + command <<< + set -euxo pipefail + + source /longbow/venv/bin/activate + longbow pad --model ~{model} -v INFO --barcode-tag ~{umi_tag[model]} -e ~{padding} -o tmp.bam -n ~{umi_tag[model]} ~{bam} + + samtools sort tmp.bam -o ~{prefix}.umi_padded_~{padding}.bam + >>> + + output { + File padded_umi_bam = "~{prefix}.umi_padded_~{padding}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, # Decent amount of CPU and Memory because network transfer speed is proportional to VM "power" + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, # This shouldn't take very long, but it's nice to have things done quickly, so no preemption here. + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-longbow:0.5.40" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task PadCBC +{ + input { + File bam + String model + + String prefix = "out" + Int padding = 2 + + String? barcode_tag + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 10*ceil(size(bam, "GB")) + + Map[String, String] barcode_tags = { + "mas_15_sc_10x5p_single_none": "CR", + "mas_15_sc_10x3p_single_none": "CR", + "mas_10_sc_10x5p_single_none": "CR", + "mas_15_bulk_teloprimeV2_single_none": "BC", + } + + String final_barcode_tag = select_first([barcode_tag,barcode_tags[model]]) + + command <<< + set -euxo pipefail + + source /longbow/venv/bin/activate + longbow pad --model ~{model} -v INFO --barcode-tag ~{final_barcode_tag} -e ~{padding} -o tmp.bam -n ~{final_barcode_tag} ~{bam} + + samtools sort tmp.bam -o ~{prefix}.cbc_padded_~{padding}.bam + >>> + + output { + File padded_cbc_bam = "~{prefix}.cbc_padded_~{padding}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, # Decent amount of CPU and Memory because network transfer speed is proportional to VM "power" + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, # This shouldn't take very long, but it's nice to have things done quickly, so no preemption here. + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-longbow:0.5.40" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task TagFix +{ + input { + File bam + + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 10*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + # Make sure we use all our proocesors: + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + + source /longbow/venv/bin/activate + longbow tagfix -t${np} -v INFO -o tmp.bam ~{bam} + + samtools sort -@${np} tmp.bam -o ~{prefix}.alignment_tags_fixed.bam + >>> + + output { + File tag_fixed_bam = "~{prefix}.alignment_tags_fixed.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, # Decent amount of CPU and Memory because network transfer speed is proportional to VM "power" + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, # This shouldn't take very long, but it's nice to have things done quickly, so no preemption here. + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-longbow:0.5.40" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Correct { + input { + File bam + String model + + Int ccs_lev_dist_threshold = 2 + Int clr_lev_dist_threshold = 2 + + File? barcode_allow_list + String? barcode_tag + String? corrected_tag + + File? barcode_freq_list + + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + String barcode_freq_arg = if defined(barcode_freq_list) then " --barcode-freqs " else "" + + Int disk_size = 10 * ceil(size(bam, "GB")) + + Map[String, String] allowlists = { + "mas_15_sc_10x5p_single_none": "/longbow/resources/barcodes/cellranger/737K-august-2016.txt", + "mas_15_sc_10x3p_single_none": "/longbow/resources/barcodes/cellranger/3M-february-2018.txt.gz", + "mas_15_bulk_10x5p_single_internal": "/longbow/resources/barcodes/cellranger/737K-august-2016.txt", + "mas_10_sc_10x5p_single_none": "/longbow/resources/barcodes/cellranger/737K-august-2016.txt", + "mas_15_bulk_teloprimeV2_single_none": "/longbow/resources/barcodes/indexes/tpv2.indices.txt", + } + + Map[String, String] machine_memory = { + "mas_15_sc_10x5p_single_none": 32, + "mas_15_sc_10x3p_single_none": 64, + "mas_15_bulk_10x5p_single_internal": 32, + "mas_10_sc_10x5p_single_none": 32, + "mas_15_bulk_teloprimeV2_single_none": 32, + } + + Map[String, String] barcode_tags = { + "mas_15_sc_10x5p_single_none": "CR", + "mas_15_sc_10x3p_single_none": "CR", + "mas_15_bulk_10x5p_single_internal": "CR", + "mas_10_sc_10x5p_single_none": "CR", + "mas_15_bulk_teloprimeV2_single_none": "BC", + } + + Map[String, String] corrected_tags = { + "mas_15_sc_10x5p_single_none": "CB", + "mas_15_sc_10x3p_single_none": "CB", + "mas_15_bulk_10x5p_single_internal": "CB", + "mas_10_sc_10x5p_single_none": "CB", + "mas_15_bulk_teloprimeV2_single_none": "BC", + } + + # Resolve allow list and barcode tags based on inputs and model: + String final_barcode_tag = select_first([barcode_tag,barcode_tags[model]]) + String final_corrected_tag = select_first([corrected_tag,corrected_tags[model]]) + + command <<< + set -euxo pipefail + + # For some reason this the allow list specification isn't working in the case of a supplied list. 
+ # We have to do some cleverness here to make it work: + user_specified_allow_list="~{barcode_allow_list}" + if [[ "${#user_specified_allow_list}" -gt 0 ]] ; then + allow_list=~{barcode_allow_list} + else + allow_list=~{allowlists[model]} + fi + + # NOTE: We can only use 1 thread here because the index will be built independently on each thread, + # and the index takes a LOT of memory. + longbow correct \ + -t 1 \ + --model ~{model} \ + --allow-list ${allow_list} \ + ~{barcode_freq_arg}~{default="" barcode_freq_list} \ + -v INFO \ + --barcode-tag ~{final_barcode_tag} \ + --corrected-tag ~{final_corrected_tag} \ + --max-hifi-dist ~{ccs_lev_dist_threshold} \ + --max-clr-dist ~{clr_lev_dist_threshold} \ + -o ~{prefix}.corrected.bam \ + --barcode-uncorrectable-bam ~{prefix}.uncorrected_barcodes.bam \ + ~{bam} 2>&1 | tee longbow_correct.~{prefix}.log + >>> + + output { + File corrected_bam = "~{prefix}.corrected.bam" + File uncorrected_bam = "~{prefix}.uncorrected_barcodes.bam" + File log = "longbow_correct.~{prefix}.log" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: machine_memory[model], # MUST be this big because of the symspell barcode index. 
+ disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-longbow:0.5.40" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Stats { + input { + File bam + + String prefix = "stats" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 10 * ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + longbow stats -o ~{prefix} ~{bam} + >>> + + output { + Array[File] pngs = glob("*.png") + Array[File] svgs = glob("*.svg") + File summary = "~{prefix}_summary_stats.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-longbow:0.5.40" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, 
default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Demultiplex { + input { + File bam + String tag + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 4 * ceil(size(bam, "GB")) + + String basename = basename(bam, ".bam") + + # TODO: We should do this with 2 passes so that after we've ID'd the models involved, we go back and can refine the demux. This is mostly for CLR reads. + + command <<< + set -euxo pipefail + + longbow demultiplex -d ~{tag} -o ~{basename} + >>> + + output { + Array[File] demuxed_bams = glob("~{basename}*.bam") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-longbow:0.5.40" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task AggregateCorrectLogStats +{ + input { + Array[File] longbow_correct_log_files + + String out_name = "longbow_correct_stats.txt" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 2*ceil(size(longbow_correct_log_files, "GB")) + + # YES, this SHOULD be a proper tool, but right now it isn't. 
+ command <<< +python << CODE +import os + +stats_dict = dict() +line_key = "STATS: " + +for stats_file in ["~{sep='","' longbow_correct_log_files}"]: + with open(stats_file, 'r') as f: + for line in f: + if line_key in line: + line = line.strip() + s = line[line.find(line_key) + len(line_key):] + key, remainder = [t.strip() for t in s.split(":")] + if "/" in remainder: + count = int(remainder[:remainder.find("/")]) + tot = int(remainder[remainder.find("/")+1:remainder.find(" ")]) + else: + count = int(remainder) + tot = None + + try: + c, t = stats_dict[key] + if tot is not None: + tot += t + stats_dict[key] = (count + c, tot) + except KeyError: + stats_dict[key] = (count, tot) + +k_len = 0 +for k in stats_dict.keys(): + if len(k) > k_len: + k_len = len(k) + +k_prefix = list(stats_dict.keys())[0] +k_prefix = k_prefix[:k_prefix.find(" ")] +with open("~{out_name}", 'w') as f: + for k, v in stats_dict.items(): + + if not k.startswith(k_prefix): + f.write("\n") + k_prefix = k[:k.find(" ")] + + k_spacing = k_len - len(k) + + count, tot = v + if tot is None or tot == 0: + f.write(f"{k}:{' '*k_spacing} {count}\n") + print(f"WARNING: tot == {tot}") + else: + f.write(f"{k}:{' '*k_spacing} {count}/{tot} ({100.0*count/tot:2.4f}%)\n") + +CODE + >>> + + output { + File stats = out_name + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, # Decent amount of CPU and Memory because network transfer speed is proportional to VM "power" + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, # This shouldn't take very long, but it's nice to have things done quickly, so no preemption here. 
+ max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-longbow:0.5.40" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + diff --git a/wdl/lib/Preprocessing/Medaka.wdl b/wdl/lib/Preprocessing/Medaka.wdl new file mode 100644 index 0000000..79735eb --- /dev/null +++ b/wdl/lib/Preprocessing/Medaka.wdl @@ -0,0 +1,89 @@ +version 1.0 + +########################################################################################## +# Runs Medaka on an ONT draft assembly with GUPPY basecalled ONT reads +# - Runs within a few hours with 18GB basecalled_reads and a 23Mb genome +########################################################################################## + +import "../../structs/Structs.wdl" + +task MedakaPolish { + input { + File basecalled_reads + File draft_assembly + + String prefix = "consensus" + String model = "r941_prom_high_g360" + Int n_rounds = 3 + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + basecalled_reads: "basecalled reads to be used with polishing" + draft_assembly: "draft assembly to be polished" + prefix: "prefix for output files" + model: "run `medaka tools list_models` and pick string with the correct pore type, machine, and guppy version" + n_rounds: "number of polishing rounds to apply" + } + + Int disk_size = 4 * n_rounds * ceil(size([basecalled_reads, draft_assembly], "GB")) + + ### + # Medaka models + # Available: r103_min_high_g345, r103_min_high_g360, r103_prom_high_g360, r103_prom_snp_g3210, r103_prom_variant_g3210, + # r10_min_high_g303, r10_min_high_g340, + # r941_min_fast_g303, r941_min_high_g303, r941_min_high_g330, r941_min_high_g340_rle, r941_min_high_g344, + # r941_min_high_g351, r941_min_high_g360 + # r941_prom_fast_g303, r941_prom_high_g303, r941_prom_high_g330, r941_prom_high_g344, r941_prom_high_g360, + # r941_prom_snp_g303, r941_prom_snp_g322, r941_prom_snp_g360, + # r941_prom_variant_g303, r941_prom_variant_g322, r941_prom_variant_g360 + ### + + command <<< + source /medaka/venv/bin/activate + + set -euxo pipefail + + mkdir output_0_rounds + cp ~{draft_assembly} output_0_rounds/consensus.fasta + + for i in {1..~{n_rounds}} + do + medaka_consensus -i ~{basecalled_reads} -d output_$((i-1))_rounds/consensus.fasta -o output_${i}_rounds -t 8 -m ~{model} + done + + cp output_~{n_rounds}_rounds/consensus.fasta ~{prefix}.fasta + >>> + + output { + File polished_assembly = "~{prefix}.fasta" + } + + ################### + RuntimeAttr default_attr = object { + cpu_cores: 8, + mem_gb: 24, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-medaka:0.1.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + 
select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + gpuType: "nvidia-tesla-t4" + gpuCount: 1 + nvidiaDriverVersion: "418.152.00" + zones: ["us-east1-c"] + cpuPlatform: "Intel Haswell" + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + diff --git a/wdl/lib/QC/AlignedMetrics.wdl b/wdl/lib/QC/AlignedMetrics.wdl new file mode 100644 index 0000000..4a820e7 --- /dev/null +++ b/wdl/lib/QC/AlignedMetrics.wdl @@ -0,0 +1,609 @@ +version 1.0 + +import "../../structs/Structs.wdl" +import "../Utility/Finalize.wdl" as FF + +workflow AlignedMetrics { + input { + File aligned_bam + File aligned_bai + + File ref_fasta + File ref_dict + + String? gcs_output_dir + } + + call ReadMetrics as AlignedReadMetrics { input: bam = aligned_bam } + + call MakeChrIntervalList { input: ref_dict = ref_dict } + + scatter (chr_info in MakeChrIntervalList.chrs) { + call MosDepth { + input: + bam = aligned_bam, + bai = aligned_bai, + chr = chr_info[0] + } + + call SummarizeDepth { input: regions = MosDepth.regions } + } + + call FlagStats as AlignedFlagStats { input: bam = aligned_bam } + + if (defined(gcs_output_dir)) { + String outdir = sub(gcs_output_dir + "", "/$", "") + + call FF.FinalizeToDir as FFYieldAligned { + input: + outdir = outdir + "/yield_aligned/", + files = [ + AlignedFlagStats.flag_stats, + AlignedReadMetrics.np_hist, + AlignedReadMetrics.range_gap_hist, + AlignedReadMetrics.zmw_hist, + AlignedReadMetrics.prl_counts, + AlignedReadMetrics.prl_hist, + AlignedReadMetrics.prl_nx, + AlignedReadMetrics.prl_yield_hist, + AlignedReadMetrics.rl_counts, + AlignedReadMetrics.rl_hist, + AlignedReadMetrics.rl_nx, + AlignedReadMetrics.rl_yield_hist + ] + } + + call FF.FinalizeToDir as 
FFCoverageFullDist { input: outdir = outdir + "/coverage/", files = MosDepth.full_dist } + call FF.FinalizeToDir as FFCoverageGlobalDist { input: outdir = outdir + "/coverage/", files = MosDepth.global_dist } + call FF.FinalizeToDir as FFCoverageRegionDist { input: outdir = outdir + "/coverage/", files = MosDepth.region_dist } + call FF.FinalizeToDir as FFCoverageRegions { input: outdir = outdir + "/coverage/", files = MosDepth.regions } + call FF.FinalizeToDir as FFCoverageRegionsCsi { input: outdir = outdir + "/coverage/", files = MosDepth.regions_csi } + call FF.FinalizeToDir as FFCoverageQuantizedDist { input: outdir = outdir + "/coverage/", files = MosDepth.quantized_dist } + call FF.FinalizeToDir as FFCoverageQuantized { input: outdir = outdir + "/coverage/", files = MosDepth.quantized } + call FF.FinalizeToDir as FFCoverageQuantizedCsi { input: outdir = outdir + "/coverage/", files = MosDepth.quantized_csi } + + call FF.FinalizeToDir as FFDepthSummaries { input: outdir = outdir + "/coverage_summaries/", files = SummarizeDepth.cov_summary } + } + + output { + File aligned_flag_stats = AlignedFlagStats.flag_stats + + Array[File] coverage_full_dist = MosDepth.full_dist + Array[File] coverage_global_dist = MosDepth.global_dist + Array[File] coverage_region_dist = MosDepth.region_dist + Array[File] coverage_regions = MosDepth.regions + Array[File] coverage_regions_csi = MosDepth.regions_csi + Array[File] coverage_quantized_dist = MosDepth.quantized_dist + Array[File] coverage_quantized = MosDepth.quantized + Array[File] coverage_quantized_csi = MosDepth.quantized_csi + + File aligned_np_hist = AlignedReadMetrics.np_hist + File aligned_range_gap_hist = AlignedReadMetrics.range_gap_hist + File aligned_zmw_hist = AlignedReadMetrics.zmw_hist + File aligned_prl_counts = AlignedReadMetrics.prl_counts + File aligned_prl_hist = AlignedReadMetrics.prl_hist + File aligned_prl_nx = AlignedReadMetrics.prl_nx + File aligned_prl_yield_hist = AlignedReadMetrics.prl_yield_hist + 
File aligned_rl_counts = AlignedReadMetrics.rl_counts + File aligned_rl_hist = AlignedReadMetrics.rl_hist + File aligned_rl_nx = AlignedReadMetrics.rl_nx + File aligned_rl_yield_hist = AlignedReadMetrics.rl_yield_hist + + File raw_chr_intervals = MakeChrIntervalList.raw_chrs + } +} + +task MakeChrIntervalList { + input { + File ref_dict + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 10 + + command <<< + set -euxo pipefail + + grep '^@SQ' ~{ref_dict} | awk '{ print $2 "\t" 1 "\t" $3 }' | sed 's/[SL]N://g' | grep -v -e random -e chrUn -e decoy -e alt -e HLA -e EBV > chrs.txt + >>> + + output { + Array[Array[String]] chrs = read_tsv("chrs.txt") + File raw_chrs = "chrs.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MosDepth { + input { + File bam + File bai + String chr + Int? window_size + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB")) + Int ws = select_first([window_size, 500]) + String basename = basename(bam, ".bam") + String prefix = "~{basename}.coverage.~{chr}" + + command <<< + set -euxo pipefail + + mosdepth -t 4 -c "~{chr}" -n -x -Q 1 ~{prefix}.full ~{bam} + mosdepth -t 4 -c "~{chr}" -n -x -Q 1 -b ~{ws} ~{prefix} ~{bam} + + export MOSDEPTH_Q0=NO_COVERAGE # 0 -- defined by the arguments to --quantize + export MOSDEPTH_Q1=LOW_COVERAGE # 1..4 + export MOSDEPTH_Q2=CALLABLE # 5..149 + export MOSDEPTH_Q3=HIGH_COVERAGE # 150 ... + + mosdepth -t 4 -c "~{chr}" -n -x -Q 1 --quantize 0:1:5:150: ~{prefix}.quantized ~{bam} + >>> + + output { + File full_dist = "~{prefix}.full.mosdepth.global.dist.txt" + File global_dist = "~{prefix}.mosdepth.global.dist.txt" + File region_dist = "~{prefix}.mosdepth.region.dist.txt" + File regions = "~{prefix}.regions.bed.gz" + File regions_csi = "~{prefix}.regions.bed.gz.csi" + File quantized_dist = "~{prefix}.quantized.mosdepth.global.dist.txt" + File quantized = "~{prefix}.quantized.quantized.bed.gz" + File quantized_csi = "~{prefix}.quantized.quantized.bed.gz.csi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-mosdepth:0.3.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, 
default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MosDepthOverBed { + input { + File bam + File bai + File bed + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB")) + String basename = basename(bam, ".bam") + String bedname = basename(bed, ".bed") + String prefix = "~{basename}.coverage_over_bed.~{bedname}" + + command <<< + set -euxo pipefail + + mosdepth -t 4 -b ~{bed} -n -x -Q 1 ~{prefix} ~{bam} + >>> + + output { + File global_dist = "~{prefix}.mosdepth.global.dist.txt" + File region_dist = "~{prefix}.mosdepth.region.dist.txt" + File regions = "~{prefix}.regions.bed.gz" + File regions_csi = "~{prefix}.regions.bed.gz.csi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "quay.io/biocontainers/mosdepth:0.2.4--he527e40_0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task SummarizeDepth { + input { + File regions + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 2*ceil(size(regions, "GB")) + String chrName = sub(basename(regions, ".regions.bed.gz"), "out.coverage.", "") + + command <<< + set -euxo pipefail + + ((echo 'chr start stop cov_mean cov_sd cov_q1 cov_median cov_q3 cov_iqr') && \ + (zcat ~{regions} | datamash first 1 first 2 last 3 mean 4 sstdev 4 q1 4 median 4 q3 4 iqr 4)) | \ + column -t > ~{chrName}.summary.txt + >>> + + output { + File cov_summary = "~{chrName}.summary.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task CoverageTrack { + input { + File bam + File bai + String chr + String start + String end + + RuntimeAttr? 
runtime_attr_override + } + + String basename = basename(bam, ".bam") + Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB")) + + command <<< + set -euxo pipefail + + samtools depth -a ~{bam} -r ~{chr}:~{start}-~{end} | bgzip > ~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz + tabix -p bed ~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz + >>> + + output { + File coverage = "~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz" + File coverage_tbi = "~{basename}.coverage.~{chr}_~{start}_~{end}.txt.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FlagStats { + input { + File bam + + RuntimeAttr? 
runtime_attr_override + } + + String basename = basename(bam, ".bam") + Int disk_size = 2*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + samtools flagstat ~{bam} > ~{basename}.flag_stats.txt + >>> + + output { + File flag_stats = "~{basename}.flag_stats.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ReadNamesAndLengths { + input { + File bam + + RuntimeAttr? 
runtime_attr_override + } + + String basename = basename(bam, ".bam") + Int disk_size = 2*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + samtools view ~{bam} | awk '{ print $1, length($10) }' | gzip -1 > ~{basename}.read_names_and_lengths.txt.gz + >>> + + output { + File read_names_and_lengths = "~{basename}.read_names_and_lengths.txt.gz" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FilterMQ0Reads { + input { + File bam + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 2*ceil(size(bam, "GB")) + String prefix = basename(bam, ".bam") + + command <<< + set -euxo pipefail + + samtools view -q 1 -b ~{bam} > ~{prefix}.no_mq0.bam + samtools index ~{prefix}.no_mq0.bam + >>> + + output { + File no_mq0_bam = "~{prefix}.no_mq0.bam" + File no_mq0_bai = "~{prefix}.no_mq0.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ComputeBedCoverage { + input { + File bam + File bai + File bed + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 2*ceil(size(bam, "GB") + size(bai, "GB") + size(bed, "GB")) + + command <<< + set -euxo pipefail + + bedtools coverage -b ~{bed} -a ~{bam} -nobuf | gzip > ~{prefix}.txt.gz + zcat ~{prefix}.txt.gz | awk '{ sum += sprintf("%f", $15*$16) } END { printf("%f\n", sum) }' > ~{prefix}.count.txt + >>> + + output { + File coverage = "~{prefix}.txt.gz" + Float counts = read_float("~{prefix}.count.txt") + File counts_file = "~{prefix}.count.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ReadMetrics { + input { + File bam + + RuntimeAttr? 
runtime_attr_override + } + + String basename = basename(bam, ".bam") + Int disk_size = 2*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + java -jar /usr/local/bin/gatk.jar ComputeLongReadMetrics -I ~{bam} -O ~{basename}.read_metrics -DF WellformedReadFilter + >>> + + output { + File np_hist = "~{basename}.read_metrics.np_hist.txt" + File range_gap_hist = "~{basename}.read_metrics.range_gap_hist.txt" + File zmw_hist = "~{basename}.read_metrics.zmw_hist.txt" + File prl_counts = "~{basename}.read_metrics.prl_counts.txt" + File prl_hist = "~{basename}.read_metrics.prl_hist.txt" + File prl_nx = "~{basename}.read_metrics.prl_nx.txt" + File prl_yield_hist = "~{basename}.read_metrics.prl_yield_hist.txt" + File rl_counts = "~{basename}.read_metrics.rl_counts.txt" + File rl_hist = "~{basename}.read_metrics.rl_hist.txt" + File rl_nx = "~{basename}.read_metrics.rl_nx.txt" + File rl_yield_hist = "~{basename}.read_metrics.rl_yield_hist.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 50, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task BamToBed { + input { + File bam + File bai + + RuntimeAttr? 
runtime_attr_override + } + + String bed = basename(bam, ".bam") + ".bed" + Int disk_size = 4*ceil(size(bam, "GB") + size(bai, "GB")) + + command <<< + set -euxo pipefail + + bedtools bamtobed -i ~{bam} > ~{bed} + >>> + + output { + File bedfile = bed + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/QC/CollectPacBioAlignedMetrics.wdl b/wdl/lib/QC/CollectPacBioAlignedMetrics.wdl new file mode 100644 index 0000000..9c4e580 --- /dev/null +++ b/wdl/lib/QC/CollectPacBioAlignedMetrics.wdl @@ -0,0 +1,77 @@ +version 1.0 + +import "../Utility/PBUtils.wdl" as PB +import "../Visualization/NanoPlot.wdl" as NP + +workflow CollectPacBioAlignedMetrics { + meta { + desciption: + "Collect a few custom metrics on the alignments." + } + + input { + File aligned_bam + File aligned_bai + File aligned_pbi + } + + parameter_meta { + custom_aln_metrics_summary: "A 2-col TSV holding custom metrics on the alignments: 1-col is attribute name, 2-col is value." + nanoplot_stats: "Nanoplot stats file (NanoStats.txt) on the input BAM." + nanoplot_pngs: "Nanoplot figures on the input BAM." 
+ } + + # note: this may not matter anymore if the input bam is HiFi only? + call PB.SummarizePBI as SummarizeAlignedPBI { input: pbi = aligned_pbi } + call PB.SummarizePBI as SummarizeAlignedQ5PBI { input: pbi = aligned_pbi, qual_threshold = 5 } + call PB.SummarizePBI as SummarizeAlignedQ7PBI { input: pbi = aligned_pbi, qual_threshold = 7 } + call PB.SummarizePBI as SummarizeAlignedQ10PBI { input: pbi = aligned_pbi, qual_threshold = 10 } + call PB.SummarizePBI as SummarizeAlignedQ12PBI { input: pbi = aligned_pbi, qual_threshold = 12 } + call PB.SummarizePBI as SummarizeAlignedQ15PBI { input: pbi = aligned_pbi, qual_threshold = 15 } + + call NP.NanoPlotFromBam { input: bam = aligned_bam, bai = aligned_bai } + + call CustomMetricsSummaryToFile { + input: + attributes = ["num_reads_Q5", "num_reads_Q7", "num_reads_Q10", "num_reads_Q12", "num_reads_Q15", + "aligned_num_reads", "aligned_num_bases", "aligned_frac_bases", + "aligned_read_length_mean", "aligned_read_length_median", "aligned_read_length_stdev", "aligned_read_length_N50", + "average_identity", "median_identity"], + values = [SummarizeAlignedQ5PBI.results['reads'], SummarizeAlignedQ7PBI.results['reads'], SummarizeAlignedQ10PBI.results['reads'], SummarizeAlignedQ12PBI.results['reads'], SummarizeAlignedQ15PBI.results['reads'], + NanoPlotFromBam.stats_map['number_of_reads'], NanoPlotFromBam.stats_map['number_of_bases_aligned'], NanoPlotFromBam.stats_map['fraction_bases_aligned'], + NanoPlotFromBam.stats_map['mean_read_length'], NanoPlotFromBam.stats_map['median_read_length'], NanoPlotFromBam.stats_map['read_length_stdev'], NanoPlotFromBam.stats_map['n50'], + NanoPlotFromBam.stats_map['average_identity'], NanoPlotFromBam.stats_map['median_identity']] + } + + output { + File custom_aln_metrics_summary = CustomMetricsSummaryToFile.custom_aln_metrics_summary + File nanoplot_stats = NanoPlotFromBam.stats + Array[File] nanoplot_pngs = NanoPlotFromBam.plots + } +} + +task CustomMetricsSummaryToFile { + meta { + 
description:
+        "Format a few custom metrics on the alignments into a 2-col TSV: 1-col is attribute name, 2-col is value."
+    }
+
+    input {
+        Array[String] attributes
+        Array[String] values
+    }
+
+    command <<<
+        set -eux
+        paste ~{write_lines(attributes)} ~{write_lines(values)} > "alignment.metrics.tsv"
+    >>>
+
+    output {
+        File custom_aln_metrics_summary = "alignment.metrics.tsv"
+    }
+
+    runtime {
+        disks: "local-disk 100 HDD"
+        docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest"
+    }
+}
diff --git a/wdl/lib/QC/CollectSMRTCellUnalignedMetrics.wdl b/wdl/lib/QC/CollectSMRTCellUnalignedMetrics.wdl
new file mode 100644
index 0000000..87f9950
--- /dev/null
+++ b/wdl/lib/QC/CollectSMRTCellUnalignedMetrics.wdl
@@ -0,0 +1,31 @@
+version 1.0
+
+import "../Utility/PBUtils.wdl" as PB
+import "../Utility/Utils.wdl"
+
+workflow CollectSMRTCellUnalignedMetrics {
+    input {
+        File smrtcell_pbi
+    }
+
+    call PB.SummarizePBI { input: pbi = smrtcell_pbi, runtime_attr_override = { 'mem_gb': 72 } }
+    call Utils.MapToTsv {input: my_map = SummarizePBI.results, name_of_file = "pbi.summary.tsv" }
+
+    output {
+
+        File pbi_summary = MapToTsv.result
+
+        Float polymerase_read_length_mean = SummarizePBI.results['polymerase_mean']
+        Float polymerase_read_length_N50 = SummarizePBI.results['polymerase_n50']
+        Float subread_read_length_mean = SummarizePBI.results['subread_mean']
+        Float subread_read_length_N50 = SummarizePBI.results['subread_n50']
+        Float num_reads = SummarizePBI.results['reads']
+        Float num_bases = SummarizePBI.results['bases']
+        Float read_length_mean = SummarizePBI.results['subread_mean']
+        Float read_length_median = SummarizePBI.results['subread_median']
+        Float read_length_stdev = SummarizePBI.results['subread_stdev']
+        Float read_length_N50 = SummarizePBI.results['subread_n50']
+        Float read_qual_mean = SummarizePBI.results['mean_qual']
+        Float read_qual_median = SummarizePBI.results['median_qual']
+    }
+}
diff --git a/wdl/lib/QC/FPCheckAoU.wdl b/wdl/lib/QC/FPCheckAoU.wdl
new file mode 100644
index 0000000..726c987
--- /dev/null
+++ b/wdl/lib/QC/FPCheckAoU.wdl
@@ -0,0 +1,173 @@
+version 1.0
+
+import "../QC/Fingerprinting.wdl" as FPUtils
+import "../Utility/VariantUtils.wdl"
+
+workflow FPCheckAoU {
+
+    meta {
+        description:
+        "Check correctness of metadata on a (demultiplexed) aligned BAM, by genotyping the BAM (generated with that metadata) against a fingerprint VCF. Practically assumes human GRCh38 reference."
+    }
+
+    input {
+        File aligned_bam
+        File aligned_bai
+
+        String fp_store
+        String sample_id_at_store
+
+        File ref_specific_haplotype_map
+
+        Float lod_pass_threshold = 6.0
+        Float lod_fail_threshold = -3.0
+    }
+
+    parameter_meta {
+        aligned_bam: "GCS path to aligned BAM file, supposed to be of the same sample as from the fingerprinting (FP) VCF"
+
+        fp_store: "Name of the bucket and prefix holding the fingerprint VCFs."
+        sample_id_at_store: "UUID of the sample at the fingerprint store, used to fetch the fingerprinting VCF"
+
+        ref_specific_haplotype_map: "Haplotype map file for the reference build used. See https://bit.ly/3QyZbwt "
+
+        lod_expected_sample: "An LOD score assuming the BAM is the same sample as the FP VCF, i.e. BAM sourced from the 'expected sample'."
+
+        lod_pass_threshold: "A numeric threshold for LOD above which the sample will be considered passing the FP check."
+        lod_fail_threshold: "A numeric threshold for LOD below which the sample will be considered failing the FP check."
+
+        FP_status: "A single word summary on the result of FP check; one of [PASS, FAIL, BORDERLINE]."
+        fingerprint_summary: "A file holding the summaries of LOD (a bit more detail than pass/fail)."
+        fingerprint_details: "A file holding the detailed LOD at each FP site."
+ } + + + ##### Prep work + call ResolveFPVCFPath {input: fp_store = fp_store, sample_id_at_store = sample_id_at_store} + call ReheaderFullGRCh38VCFtoNoAlt {input: full_GRCh38_vcf = ResolveFPVCFPath.fp_vcf} + + call VariantUtils.GetVCFSampleName { + input: + fingerprint_vcf = ReheaderFullGRCh38VCFtoNoAlt.reheadered_vcf + } + call FPUtils.FilterGenotypesVCF { + input: + fingerprint_vcf = ReheaderFullGRCh38VCFtoNoAlt.reheadered_vcf + } + call FPUtils.ExtractGenotypingSites { + input: + fingerprint_vcf = FilterGenotypesVCF.ready_to_use_vcf + } + call FPUtils.ExtractRelevantGenotypingReads { + input: + aligned_bam = aligned_bam, + aligned_bai = aligned_bai, + genotyping_sites_bed = ExtractGenotypingSites.sites, + } + + ##### check + call FPUtils.CheckFingerprint { + input: + aligned_bam = ExtractRelevantGenotypingReads.relevant_reads, + aligned_bai = ExtractRelevantGenotypingReads.relevant_reads_bai, + fingerprint_vcf = FilterGenotypesVCF.ready_to_use_vcf, + vcf_sample_name = GetVCFSampleName.sample_name, + haplotype_map = ref_specific_haplotype_map + } + + ##### wrapup + Float lod_expected_sample_t = CheckFingerprint.metrics_map['LOD_EXPECTED_SAMPLE'] + + String status = if(lod_expected_sample_t < lod_fail_threshold) then "FAIL" else if (lod_expected_sample_t > lod_pass_threshold) then "PASS" else "BORDERLINE" + + output { + Float lod_expected_sample = lod_expected_sample_t + String FP_status = status + + File fingerprint_summary = CheckFingerprint.summary_metrics + File fingerprint_details = CheckFingerprint.detail_metrics + } +} + +task ResolveFPVCFPath { + meta { + description: + "Find the fingerprint VCF at the fingerprint store; project specific." + } + + input { + String fp_store + String sample_id_at_store + RuntimeAttr? 
runtime_attr_override + } + + String fp_store_formatted = sub(fp_store, "/$", "") + + command <<< + set -eux + + # note the addition of the wildcard character * + FP_SEARCH="~{fp_store_formatted}/~{sample_id_at_store}*.fingerprint.liftedover.vcf" + # this will error if no paths match, i.e. no FP file exists with this sample_id_at_store + FP_PATH=$(gsutil ls "${FP_SEARCH}" | head -n 1) + FP_INDEX_PATH="${FP_PATH}.idx" + + echo "${FP_PATH}" > "vcf.gspath" + echo "${FP_INDEX_PATH}" > "index.gspath" + >>> + + output { + String fp_vcf = read_string("vcf.gspath") + String fp_vcf_idx = read_string("index.gspath") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: 100, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ReheaderFullGRCh38VCFtoNoAlt { + meta { + description: + "Reheader the fingerprint VCF that's generated with full GRCh38 reference to the no_alt header; project specific." 
+ } + + input { + File full_GRCh38_vcf + } + + command <<< + set -eux + + grep -vF "_decoy,length=" ~{full_GRCh38_vcf} | \ + grep -vF "_alt,length=" | \ + grep -v "^##contig= "reheadered.fp.vcf" + >>> + + output { + File reheadered_vcf = "reheadered.fp.vcf" + } + + runtime { + disks: "local-disk 100 HDD" + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} diff --git a/wdl/lib/QC/Fingerprinting.wdl b/wdl/lib/QC/Fingerprinting.wdl new file mode 100644 index 0000000..68eb03a --- /dev/null +++ b/wdl/lib/QC/Fingerprinting.wdl @@ -0,0 +1,503 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task ListGenotypedVCFs { + input { + String fingerprint_store + } + + String bucket_dir = sub(fingerprint_store, "/$", "") + + command <<< + set -eux + + gsutil ls -r ~{bucket_dir}/**.vcf.gz > all.vcfs.txt + >>> + + output { + File vcf_gs_paths = "all.vcfs.txt" + } + + ################### + runtime { + cpu: 2 + memory: "4 GiB" + disks: "local-disk 50 HDD" + bootDiskSizeGb: 10 + docker:"us.gcr.io/broad-dsp-lrma/lr-basic:latest" + } +} + +task PickGenotypeVCF { + input { + File fingerprinting_vcf_gs_paths + String? 
vcf_name + } + + parameter_meta { + fingerprinting_vcf_gs_paths: "A file holding GS paths to fingerprinting GT'ed VCFs" + vcf_name: "an expression used for picking up VCFs, the filter will be applied to VCF names, a match will lead to the VCF to be included" + } + + Boolean filter = defined(vcf_name) + + command <<< + set -eux + + if ~{filter}; then + grep "~{vcf_name}$" ~{fingerprinting_vcf_gs_paths} > vcfs.txt + else + cp ~{fingerprinting_vcf_gs_paths} vcfs.txt + fi + >>> + + output { + Array[String] vcfs = if filter then [read_string("vcfs.txt")] else read_lines("vcfs.txt") + } + + ################### + runtime { + cpu: 2 + memory: "4 GiB" + disks: "local-disk 50 HDD" + bootDiskSizeGb: 10 + docker:"gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +task FilterGenotypesVCF { + input { + File fingerprint_vcf + Array[String] filters = ['_random\\t', '_decoy\\t', '_alt\\t', '^chrUn', '^HLA', '^EBV'] + } + + parameter_meta { + filters: "An array of chromosome names to filter out when verifying fingerprints" + } + + command <<< + set -eux + + GREPCMD="grep" + if [[ ~{fingerprint_vcf} =~ \.gz$ ]]; then + GREPCMD="zgrep" + fi + "${GREPCMD}" \ + -v \ + -e ' placeholder ' \ + ~{true='-e' false='' length(filters) > 0} \ + ~{sep=" -e " filters} \ + ~{fingerprint_vcf} \ + > fingerprint.fixed.vcf + >>> + + output { + File ready_to_use_vcf = "fingerprint.fixed.vcf" + } + + ################### + runtime { + cpu: 2 + memory: "4 GiB" + disks: "local-disk 50 HDD" + bootDiskSizeGb: 10 + preemptible_tries: 3 + max_retries: 2 + docker:"gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +task ExtractGenotypingSites { + input { + File fingerprint_vcf + } + + command <<< + + set -eux + + GREPCMD="grep" + if [[ ~{fingerprint_vcf} =~ \.gz$ ]]; then + GREPCMD="zgrep" + fi + + "${GREPCMD}" -v "^#" ~{fingerprint_vcf} | \ + awk 'BEGIN {OFS="\t"} {print $1, $2-1, $2, $3}' \ + > genotyping.sites.bed + >>> + + output { + File sites = "genotyping.sites.bed" + } + + 
################### + runtime { + cpu: 2 + memory: "4 GiB" + disks: "local-disk 50 HDD" + bootDiskSizeGb: 10 + preemptible_tries: 3 + max_retries: 2 + docker:"gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +task MergeGenotypingSites { + input { + Array[File] all_sites + } + + command <<< + + set -eux + cat ~{sep=' ' all_sites} | sort | uniq > "genotyping.sites.union.bed" + >>> + + output { + File merged_sites = "genotyping.sites.union.bed" + } + + ################### + runtime { + cpu: 2 + memory: "4 GiB" + disks: "local-disk 50 HDD" + bootDiskSizeGb: 10 + preemptible_tries: 3 + max_retries: 2 + docker:"gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +task ExtractRelevantGenotypingReads { + meta { + description: "Based on genotyping (SNP) sites, extract reads that overlap those places" + } + input { + File aligned_bam + File aligned_bai + + File genotyping_sites_bed + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + aligned_bam:{ + localization_optional: true + } + } + + command <<< + + set -eux + + export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` + + samtools view -h -@ 1 \ + --write-index \ + -o "relevant_reads.bam##idx##relevant_reads.bam.bai" \ + -M -L ~{genotyping_sites_bed} \ + ~{aligned_bam} + >>> + + output { + File relevant_reads = "relevant_reads.bam" + File relevant_reads_bai = "relevant_reads.bam.bai" + } + + ################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 8, + disk_gb: 375, # will use LOCAL SSD for speeding things up + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " 
LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ResetCLRBaseQual { + input { + File bam + File bai + + Int arbitrary_bq + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 100 + 2*ceil(size(bam, "GB")) + + String prefix = "barbequed" + + command <<< + set -eux + + python /usr/local/bin/reset_clr_bam_bq.py \ + -q ~{arbitrary_bq} \ + -p ~{prefix} \ + ~{bam} + rm -f "~{prefix}.bai" "~{prefix}.bam.bai" + samtools index "~{prefix}.bam" + >>> + + output { + File barbequed_bam = "~{prefix}.bam" + File barbequed_bai = "~{prefix}.bam.bai" + } + + ################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.34" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + +} + +task CheckFingerprint { + + meta { + description: "Uses Picard tool CheckFingerprint to verify if the samples in provided VCF and BAM arise from the same biological sample" + } + input { + File aligned_bam + File aligned_bai + + File 
fingerprint_vcf + String vcf_sample_name + + File haplotype_map + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + aligned_bam:{ + description: "GCS path to aligned BAM file, supposed to be of the same sample as from the fingerprinting VCF", + localization_optional: true + } + + fingerprint_vcf: "Fingerprint VCF file from local database; note that sample name must be the same as in BAM" + vcf_sample_name: "Sample name in VCF, possibly different from that in the BAM." + haplotype_map: "Happlotype map file for the reference build used. See https://bit.ly/3QyZbwt" + } + + Int disk_size = ceil(size([fingerprint_vcf, haplotype_map], "GB")) + String prefix = basename(aligned_bam, ".bam") + + command <<< + set -eux + + gatk CheckFingerprint \ + --INPUT ~{aligned_bam} \ + --GENOTYPES ~{fingerprint_vcf} \ + --EXPECTED_SAMPLE_ALIAS ~{vcf_sample_name} \ + --HAPLOTYPE_MAP ~{haplotype_map} \ + --OUTPUT ~{prefix} + + grep -v '^#' ~{prefix}.fingerprinting_summary_metrics | \ + grep -A1 READ_GROUP | \ + awk ' + { + for (i=1; i<=NF; i++) { + a[NR,i] = $i + } + } + NF>p { p = NF } + END { + for(j=1; j<=p; j++) { + str=a[1,j] + for(i=2; i<=NR; i++){ + str=str" "a[i,j]; + } + print str + } + }' | \ + sed 's/ /\t/' \ + > metrics_map.txt + + mv ~{prefix}.fingerprinting_summary_metrics \ + ~{prefix}.fingerprinting_summary_metrics.txt + mv ~{prefix}.fingerprinting_detail_metrics \ + ~{prefix}.fingerprinting_detail_metrics.txt + >>> + + output { + File summary_metrics = "~{prefix}.fingerprinting_summary_metrics.txt" + File detail_metrics = "~{prefix}.fingerprinting_detail_metrics.txt" + Map[String, String] metrics_map = read_map("metrics_map.txt") + } + + ################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 2, + docker: "us.gcr.io/broad-gatk/gatk:4.2.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: 
select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task CheckCLRFingerprint { + + meta { + description: "Uses Picard tool CheckFingerprint to verify if the samples in provided VCF and the CLR BAM arise from the same biological sample." + } + input { + File aligned_bam + File aligned_bai + Int min_base_q = 0 + + File fingerprint_vcf + String vcf_sample_name + + File haplotype_map + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + vcf_sample_name: "Sample name in VCF, possibly different from that in the BAM." + haplotype_map: "Happlotype map file for the reference build used. 
See https://bit.ly/3QyZbwt" + } + + Int disk_size = 100 + ceil(size(aligned_bam, "GB")) + String prefix = basename(aligned_bam, ".bam") + + command <<< + set -eux + + java -jar /usr/picard/picard.jar \ + CheckFingerprint \ + INPUT=~{aligned_bam} \ + GENOTYPES=~{fingerprint_vcf} \ + EXPECTED_SAMPLE_ALIAS=~{vcf_sample_name} \ + HAPLOTYPE_MAP=~{haplotype_map} \ + OUTPUT=~{prefix} \ + MIN_BASE_QUAL=~{min_base_q} + + grep -v '^#' ~{prefix}.fingerprinting_summary_metrics | \ + grep -A1 READ_GROUP | \ + awk ' + { + for (i=1; i<=NF; i++) { + a[NR,i] = $i + } + } + NF>p { p = NF } + END { + for(j=1; j<=p; j++) { + str=a[1,j] + for(i=2; i<=NR; i++){ + str=str" "a[i,j]; + } + print str + } + }' | \ + sed 's/ /\t/' \ + > metrics_map.txt + + mv ~{prefix}.fingerprinting_summary_metrics \ + ~{prefix}.fingerprinting_summary_metrics.txt + mv ~{prefix}.fingerprinting_detail_metrics \ + ~{prefix}.fingerprinting_detail_metrics.txt + >>> + + output { + File summary_metrics = "~{prefix}.fingerprinting_summary_metrics.txt" + File detail_metrics = "~{prefix}.fingerprinting_detail_metrics.txt" + Map[String, String] metrics_map = read_map("metrics_map.txt") + } + + ################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/picard:lrfp-clr" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: 
select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ReheaderFullGRCh38VCFtoNoAlt { + meta { + desciption: + "Reheader the fingperint VCF that's generated with full GRCh38 reference to the no_alt header; project specific." + } + + input { + File full_GRCh38_vcf + } + + command <<< + set -eux + + GREPCMD="grep" + if [[ ~{full_GRCh38_vcf} =~ \.gz$ ]]; then + GREPCMD="zgrep" + fi + "${GREPCMD}" -vF "_decoy,length=" ~{full_GRCh38_vcf} | \ + grep -vF "_alt,length=" | \ + grep -v "^##contig= "reheadered.fp.vcf" + >>> + + output { + File reheadered_vcf = "reheadered.fp.vcf" + } + + runtime { + disks: "local-disk 100 HDD" + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} diff --git a/wdl/lib/QC/Quast.wdl b/wdl/lib/QC/Quast.wdl new file mode 100644 index 0000000..d7c349b --- /dev/null +++ b/wdl/lib/QC/Quast.wdl @@ -0,0 +1,112 @@ +version 1.0 + +########################################################################################## +# A task that runs QUAST to evaluate a given set of assemblies +# on a species with existing reference assembly. +# - Entire Quast output will be tarballed +########################################################################################## + +import "../../structs/Structs.wdl" + +task Quast { + input { + File? ref + Array[File] assemblies + Boolean is_large = false + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + ref: "reference assembly of the species" + assemblies: "list of assemblies to evaluate" + } + + Int minimal_disk_size = 2*(ceil(size(ref, "GB") + size(assemblies, "GB"))) + Int disk_size = if minimal_disk_size > 100 then minimal_disk_size else 100 + + String size_optimization = if is_large then "--large" else " " + + command <<< + set -eux + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + quast --no-icarus \ + "~{size_optimization}" \ + --threads "${num_core}" \ + ~{true='-r' false='' defined(ref)} \ + ~{select_first([ref, ""])} \ + ~{sep=' ' assemblies} + + tree -h quast_results/ + + if [[ -d quast_results/contigs_reports ]]; then + tar -zcvf contigs_reports.tar.gz quast_results/contigs_reports + fi + >>> + + output { + File report_txt = "quast_results/latest/report.txt" + File report_html = "quast_results/latest/report.html" + + Array[File] report_in_various_formats = glob("quast_results/latest/report.*") + + Array[File] plots = glob("quast_results/latest/basic_stats/*.pdf") + + File? 
contigs_reports = "contigs_reports.tar.gz" + } + + ################### + RuntimeAttr default_attr = object { + cpu_cores: 16, + mem_gb: 80, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-quast:5.2.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task SummarizeQuastReport { + input { + File quast_report_txt + } + + command <<< + set -eux + grep -v -e '^All statistics' -e '^$' ~{quast_report_txt} | \ + sed 's/ /_/g' | \ + sed 's/__\+/\t/g' | \ + sed 's/\s\+$//g' | \ + sed 's/>=/gt/g' | \ + tee report_map.txt + + for i in $(seq 2 $(awk '{print NF}' report_map.txt | sort -nu | tail -n 1)) + do + j=$(( i - 2 )) # to make sure the primary, assuming it's the 0-th fed in to this task and the left-most value column + cut -d$'\t' -f1,${i} < report_map.txt > report_map_${j}.txt + done + >>> + + output { + File quast_metrics_together = "report_map.txt" + Array[File] quast_metrics = glob("report_map_*.txt") + } + + runtime { + disks: "local-disk 100 HDD" + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} diff --git a/wdl/lib/QC/SampleLevelAlignedMetrics.wdl b/wdl/lib/QC/SampleLevelAlignedMetrics.wdl new file mode 100644 index 0000000..f50f00f --- /dev/null +++ b/wdl/lib/QC/SampleLevelAlignedMetrics.wdl @@ -0,0 +1,103 @@ +version 1.0 + +import "../Utility/Utils.wdl" 
+import "../Visualization/NanoPlot.wdl" as NP +import "../QC/AlignedMetrics.wdl" as AM + + +workflow SampleLevelAlignedMetrics { + + meta { + description: "A utility (sub-)workflow to compute coverage on sample-leve BAM, and optionally over a provided BED file" + } + + input { + File aligned_bam + File aligned_bai + + File ref_fasta + + File? bed_to_compute_coverage + } + + call Utils.ComputeGenomeLength { input: fasta = ref_fasta } + call NP.NanoPlotFromBam { input: bam = aligned_bam, bai = aligned_bai } + + if (defined(bed_to_compute_coverage)) { + call AM.MosDepthOverBed { + input: + bam = aligned_bam, + bai = aligned_bai, + bed = select_first([bed_to_compute_coverage]) + } + + call SummarizeDepthOverWholeBed as cov_over_region { + input: + mosdepth_output = MosDepthOverBed.regions + } + } + + output { + + File? bed_cov_summary = cov_over_region.cov_summary + + Float aligned_num_reads = NanoPlotFromBam.stats_map['number_of_reads'] + Float aligned_num_bases = NanoPlotFromBam.stats_map['number_of_bases_aligned'] + Float aligned_frac_bases = NanoPlotFromBam.stats_map['fraction_bases_aligned'] + Float aligned_est_fold_cov = NanoPlotFromBam.stats_map['number_of_bases_aligned']/ComputeGenomeLength.length + + Float aligned_read_length_mean = NanoPlotFromBam.stats_map['mean_read_length'] + Float aligned_read_length_median = NanoPlotFromBam.stats_map['median_read_length'] + Float aligned_read_length_stdev = NanoPlotFromBam.stats_map['read_length_stdev'] + Float aligned_read_length_N50 = NanoPlotFromBam.stats_map['n50'] + + Float average_identity = NanoPlotFromBam.stats_map['average_identity'] + Float median_identity = NanoPlotFromBam.stats_map['median_identity'] + + Map[String, Float] reads_stats = NanoPlotFromBam.stats_map + } +} + +task SummarizeDepthOverWholeBed { + input { + File mosdepth_output + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 2*ceil(size(mosdepth_output, "GB")) + + String prefix = sub(basename(mosdepth_output, ".regions.bed.gz"), "out.coverage.", "") + + command <<< + set -euxo pipefail + + echo 'chr start stop gene cov_mean' | awk 'BEGIN {OFS="\t"} {print}' > ~{prefix}.summary.txt + zcat ~{mosdepth_output} >> ~{prefix}.summary.txt + >>> + + output { + File cov_summary = "~{prefix}.summary.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/Transcriptomics/MASSeq.wdl b/wdl/lib/Transcriptomics/MASSeq.wdl new file mode 100644 index 0000000..c31daf9 --- /dev/null +++ b/wdl/lib/Transcriptomics/MASSeq.wdl @@ -0,0 +1,305 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task RemoveMasSeqTruncatedReads { + input { + File bam + + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size_gb = 1 + ceil(2 * size(bam, "GiB")) + + command <<< + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + + /python_scripts/remove_mas_seq_trucated_reads.py ~{bam} ~{prefix}.non_truncated + samtools index -@$np ~{prefix}.non_truncated.bam + >>> + + output { + File non_trucated_bam = "~{prefix}.non_truncated.bam" + File non_trucated_bai = "~{prefix}.non_truncated.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 2, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task AdjustUmiSequenceWithAdapterAlignment { + # TODO: Move this into Longbow - both the WDL and the code itself. + meta { + description : "Extracts a new UMI from each given read by aligning the preceding adapter sequences to the read." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + input { + File bam + String prefix = "out" + + Int umi_length = 10 + String existing_umi_tag = "ZU" + String new_umi_tag = "JX" + + String? pre_pre_umi_seq + + String? pre_umi_seq + String? pre_umi_tag + + String? post_umi_seq + String? post_umi_tag + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size_gb = 10 + 3*ceil(size(bam, "GB")) + + # NOTE: Mutual exclusivity of arguments is coded in the script itself. + String pre_pre_umi_seq_arg = if defined(pre_pre_umi_seq) then " --pre-pre-umi-seq " else "" + + String pre_umi_seq_arg = if defined(pre_umi_seq) then " --pre-umi-seq " else "" + String pre_umi_tag_arg = if defined(pre_umi_tag) then " --pre-umi-tag " else "" + + String post_umi_seq_arg = if defined(post_umi_seq) then " --post-umi-seq " else "" + String post_umi_tag_arg = if defined(post_umi_tag) then " --post-umi-tag " else "" + + command { + set -e + + python3 /lrma/update_umi_positions_2.py \ + -b ~{bam} \ + -s /dev/null \ + --umi-length ~{umi_length} \ + --existing-umi-tag ~{existing_umi_tag} \ + --new-umi-tag ~{new_umi_tag} \ + ~{pre_pre_umi_seq_arg}~{default="" pre_pre_umi_seq} \ + ~{pre_umi_seq_arg}~{default="" pre_umi_seq} \ + ~{pre_umi_tag_arg}~{default="" pre_umi_tag} \ + ~{post_umi_seq_arg}~{default="" post_umi_seq} \ + ~{post_umi_tag_arg}~{default="" post_umi_tag} \ + -o ~{prefix}.umi_adjusted.bam | tee ~{prefix}.log + } + + output { + File umi_adjusted_bam = "~{prefix}.umi_adjusted.bam" + File log = "~{prefix}.log" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, # Decent amount of CPU and Memory because network transfer speed is proportional to VM "power" + mem_gb: 16, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 0, # This shouldn't take very long, but it's nice to have things done quickly, so no preemption here. 
+ max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-10x:0.1.18" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FilterMasSeqReads { + input { + File input_bam + File input_bai + + Int maxReadLength = 15000 + Int maxEndClipping = 1000 + + String prefix = "out" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size_gb = 1 + ceil(2 * size(input_bam, "GiB")) + ceil(size(input_bai, "GiB")) + + command { + /gatk/gatk PrintReads \ + -I ~{input_bam} \ + -O ~{prefix}.bam \ + --disable-read-filter WellformedReadFilter \ + --read-filter MappedReadFilter \ + --read-filter MappingQualityNotZeroReadFilter \ + --read-filter NotSecondaryAlignmentReadFilter \ + --read-filter NotSupplementaryAlignmentReadFilter \ + --read-filter ReadLengthReadFilter --max-read-length 15000 \ + --read-filter ExcessiveEndClippedReadFilter --max-clipped-bases 1000 + + echo "PWD is:" + pwd + + echo "PWD List:" + ls -la + + echo "Outfile list:" + ls -la ~{prefix}.bam* + + date + } + + output { + File bam = "~{prefix}.bam" + File bai = "~{prefix}.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 2, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "broadinstitute/gatk:4.2.6.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + 
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task RenameSingleCellBamTagsForMasIsoSeqV0 { + meta { + description : "Rename the single-cell tags in MAS-ISO-seq v0 data (CB -> Jp; ZU -> Jq ...)." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File bam + + String prefix = "tags_renamed" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size_gb = 10 + 2*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + time /python_scripts/rename_single_cell_bam_tags.py \ + ~{bam} \ + ~{prefix}.bam + + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + samtools index -@${np} ~{prefix}.bam + >>> + + output { + File bam_out = "~{prefix}.bam" + File bai = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 16, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: 
select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task RestoreSingleCellBamTagsForMasIsoSeqV0 { + meta { + description : "Restore the single-cell tags in MAS-ISO-seq v0 data (Jp -> Cb; Jq -> ZU ...)." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File bam + + String prefix = "tags_restored" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size_gb = 10 + 2*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + time /python_scripts/restore_single_cell_bam_tags.py \ + ~{bam} \ + ~{prefix}.bam + + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + samtools index -@${np} ~{prefix}.bam + >>> + + output { + File bam_out = "~{prefix}.bam" + File bai = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 16, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} \ No newline at end of file diff --git a/wdl/lib/Transcriptomics/Postprocessing_Tasks.wdl b/wdl/lib/Transcriptomics/Postprocessing_Tasks.wdl 
new file mode 100644 index 0000000..95fc7c7 --- /dev/null +++ b/wdl/lib/Transcriptomics/Postprocessing_Tasks.wdl @@ -0,0 +1,668 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task CreateCountMatrixFromAnnotatedBam { + + meta { + description : "Creates a count matrix TSV file from the given annotated bam file. Bam file must contain tags that indicate the gene/transcript (XG), cell barcode (CB), and umi (BX) of the read." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File annotated_transcriptome_bam + + File? tx_equivalence_class_assignments + + String prefix = "umi_tools_group" + + String umi_tag = "ZU" + + RuntimeAttr? runtime_attr_override + } + + String tx_eq_class_assignments_arg = if defined(tx_equivalence_class_assignments) then " --tx-eq-class-assignments " else "" + + Int disk_size_gb = 20 + 11*ceil(size(annotated_transcriptome_bam, "GB")) + + 2*ceil(size(tx_equivalence_class_assignments, "GB")) + + command <<< + set -euxo pipefail + /python_scripts/create_count_matrix_from_annotated_bam.py \ + -b ~{annotated_transcriptome_bam} \ + ~{tx_eq_class_assignments_arg} ~{default="" tx_equivalence_class_assignments} \ + --umi-tag ~{umi_tag} \ + -o ~{prefix}.tsv + >>> + + output { + File count_matrix = "~{prefix}.tsv" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 32, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: 
select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task AggregateUmiAdjustmentStats +{ + + # TODO: FINISHME + input { + Array[File] longbow_umi_adjustment_log_files + + String out_name = "longbow_umi_adjustment_stats.txt" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 2*ceil(size(longbow_umi_adjustment_log_files, "GB")) + + # YES, this SHOULD be a proper tool, but right now it isn't. + command <<< + + f=~{write_lines(longbow_umi_adjustment_log_files)} + + mv $f THE_UMI_FILE_LIST.txt + +python << CODE +import os + +stats_dict = dict() +line_key = "STATS: " + +with open("THE_UMI_FILE_LIST.txt", 'r') as umi_file_list_file: + for line in umi_file_list_file: + stats_file = line.strip() + with open(stats_file, 'r') as f: + for line in f: + if line_key in line: + line = line.strip() + s = line[line.find(line_key) + len(line_key):] + key, remainder = [t.strip() for t in s.split(":")] + if "/" in remainder: + count = int(remainder[:remainder.find("/")]) + tot = int(remainder[remainder.find("/")+1:remainder.find(" ")]) + else: + count = int(remainder) + tot = None + + try: + c, t = stats_dict[key] + if tot is not None: + tot += t + stats_dict[key] = (count + c, tot) + except KeyError: + stats_dict[key] = (count, tot) + +k_len = 0 +for k in stats_dict.keys(): + if len(k) > k_len: + k_len = len(k) + +k_prefix = list(stats_dict.keys())[0] +k_prefix = k_prefix[:k_prefix.find(" ")] +with open("~{out_name}", 'w') as f: + for k, v in stats_dict.items(): + + if not k.startswith(k_prefix): + f.write("\n") + k_prefix = k[:k.find(" ")] + + k_spacing = k_len - len(k) + + count, tot = v + if tot is None: + f.write(f"{k}:{' '*k_spacing} {count}\n") + else: + f.write(f"{k}:{' '*k_spacing} {count}/{tot} ({100.0*count/tot:2.4f}%)\n") + +CODE + >>> + + output { + File stats = out_name + } + + 
######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, # Decent amount of CPU and Memory because network transfer speed is proportional to VM "power" + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, # This shouldn't take very long, but it's nice to have things done quickly, so no preemption here. + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-longbow:0.5.27" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MergeBarcodeCounts { + meta { + description : "Merge all counts for each unique barcode in the given TSV file. Assumes file is unheadered and have two columns: BARCODE COUNT. Merging performed by adding all COUNTs for each BARCODE." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File barcode_count_tsv + String prefix = "merged_counts" + + RuntimeAttr? runtime_attr_override + } + + # 20 gb - baseline storage for safety + # 1x for the file itself + # 2x for the results and some wiggle room. 
+ Int disk_size_gb = 20 + (3 * ceil(size(barcode_count_tsv, "GB"))) + + command { + /python_scripts/merge_barcode_counts.py ~{barcode_count_tsv} + if [[ "~{prefix}.tsv" != "merged_counts.tsv" ]] ; then + mv merged_counts.tsv "~{prefix}.tsv" + fi + } + + output { + File merged_counts = "~{prefix}.tsv" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 16, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + + +task CreateCountMatrixAnndataFromTsv { + + meta { + description : "Creates a python anndata object from the given countmatrix tsv. Expects the input to have been generated by CreateCountMatrixFromAnnotatedBam. The resulting anndata object can be directly read into scanpy for single-cell analysis." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File count_matrix_tsv + File genome_annotation_gtf_file + + Boolean force_anndata_gencode_overwrite = false + + String prefix = "umi_tools_group" + + File? equivalence_class_definitions + + File? overlap_intervals + String? overlap_interval_label + File? gencode_reference_gtf_file + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size_gb = 20 + 4*ceil(size(count_matrix_tsv, "GB")) + 4*ceil(size(genome_annotation_gtf_file, "GB")) + 2*ceil(size(equivalence_class_definitions, "GB")) + + String overlap_intervals_arg = if defined(overlap_intervals) then " --overlap-intervals " else "" + String overlap_interval_label_arg = if defined(overlap_interval_label) then " --overlap-interval-label " else "" + String gencode_reference_gtf_file_arg = if defined(gencode_reference_gtf_file) then " --gencode-reference-gtf " else "" + + String force_gencode_overwrite_flag = if force_anndata_gencode_overwrite then " --force-overwrite-gencode-overlaps " else "" + + String eq_class_arg = if defined(equivalence_class_definitions) then " --eq-class-defs-tsv " else "" + + command <<< + set -euxo pipefail + /python_scripts/create_count_matrix_anndata_from_tsv.py \ + -t ~{count_matrix_tsv} \ + -g ~{genome_annotation_gtf_file} \ + ~{overlap_intervals_arg}~{default="" overlap_intervals} \ + ~{overlap_interval_label_arg}~{default="" overlap_interval_label} \ + ~{gencode_reference_gtf_file_arg}~{default="" gencode_reference_gtf_file} \ + ~{eq_class_arg} ~{default="" equivalence_class_definitions} \ + ~{force_gencode_overwrite_flag} \ + -o ~{prefix} + >>> + + output { + File transcript_gene_count_anndata_h5ad = "~{prefix}_tx_gene_counts_adata.h5ad" + Array[File] pickles = glob("*.pickle") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 32, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: 
select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task CreateCountMatrixAnndataFromEquivalenceClasses { + + meta { + description : "Creates a python anndata object from the given countmatrix tsv and equivalence classes. Expects the input to have been generated by CreateCountMatrixFromAnnotatedBam. The resulting anndata object can be directly read into scanpy for single-cell analysis." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File count_matrix_tsv + File genome_annotation_gtf_file + + File tx_equivalence_class_definitions + File tx_equivalence_class_assignments + File gene_equivalence_class_definitions + File gene_equivalence_class_assignments + + Boolean force_anndata_gencode_overwrite = false + + String prefix = "umi_tools_group" + + File? overlap_intervals + String? overlap_interval_label + File? gencode_reference_gtf_file + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size_gb = 20 + 4*ceil(size(count_matrix_tsv, "GB")) + + 4*ceil(size(genome_annotation_gtf_file, "GB")) + + 2*ceil(size(tx_equivalence_class_definitions, "GB")) + + 2*ceil(size(tx_equivalence_class_assignments, "GB")) + + 2*ceil(size(gene_equivalence_class_definitions, "GB")) + + 2*ceil(size(gene_equivalence_class_assignments, "GB")) + + 2*ceil(size(gencode_reference_gtf_file, "GB")) + + 2*ceil(size(overlap_intervals, "GB")) + + String overlap_intervals_arg = if defined(overlap_intervals) then " --overlap-intervals " else "" + String overlap_interval_label_arg = if defined(overlap_interval_label) then " --overlap-interval-label " else "" + String gencode_reference_gtf_file_arg = if defined(gencode_reference_gtf_file) then " --gencode-reference-gtf " else "" + + String force_gencode_overwrite_flag = if force_anndata_gencode_overwrite then " --force-overwrite-gencode-overlaps " else "" + + command <<< + set -euxo pipefail + /python_scripts/create_count_matrix_anndata_from_equivalence_classes.py \ + -t ~{count_matrix_tsv} \ + -g ~{genome_annotation_gtf_file} \ + --tx-eq-class-definitions ~{tx_equivalence_class_definitions} \ + --tx-eq-class-assignments ~{tx_equivalence_class_assignments} \ + --gene-eq-class-definitions ~{gene_equivalence_class_definitions} \ + --gene-eq-class-assignments ~{gene_equivalence_class_assignments} \ + ~{overlap_intervals_arg}~{default="" overlap_intervals} \ + ~{overlap_interval_label_arg}~{default="" overlap_interval_label} \ + ~{gencode_reference_gtf_file_arg}~{default="" gencode_reference_gtf_file} \ + ~{force_gencode_overwrite_flag} \ + -o ~{prefix} + >>> + + output { + File transcript_gene_count_anndata_h5ad = "~{prefix}_tx_gene_counts_adata.h5ad" + Array[File] pickles = glob("*.pickle") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 32, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: 
"us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task SubsetCountsMatrixByGenes { + + meta { + description : "Subsets a count matrix TSV file to contain only the transcripts from the given list of genes. Assumes the count matrix was produced by comparison with Gencode (due to data formatting) and that the table is a TSV with samples as rows and transcripts as columns." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File count_matrix_tsv + Array[String] gene_names + } + + parameter_meta { + count_matrix_tsv : "TSV file containing the counts of each transcript expressed in a sample. (Transcripts in columns. Samples in rows. One header row.)" + gene_names : "Array of gene names for which to keep data from the given count matrix TSV." + } + + # We're subsetting the file, so we should be able to get away with very little space here. + # 1x for the file itself + # 2x for the results and some wiggle room. 
+ Int disk_size = 3 * ceil(size(count_matrix_tsv, "GB")) + + command { + /python_scripts/subset_count_matrix_by_gene.py ~{count_matrix_tsv} ~{sep=' ' gene_names} + } + + output { + File subset_count_matrix_tsv = "count_matrix_subset_by_gene.tsv" + File subset_count_matrix_h5ad = "count_matrix_subset_by_gene.h5ad" + } + + runtime { + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + memory: 16 + " GiB" + disks: "local-disk " + disk_size + " HDD" + bootDiskSizeGb: 10 + preemptible: 0 + cpu: 8 + } +} + + +task QuantifyGffComparison { + meta { + description : "Create equivalence classes and gene assignments from a set of gffcompare results." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File genome_gtf + + File st2_gencode_refmap + File st2_gencode_tmap + File st2_read_refmap + File st2_read_tmap + File gencode_st2_refmap + File gencode_st2_tmap + File gencode_read_refmap + File gencode_read_tmap + + String prefix = "reads_comparison" + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + genome_gtf : "Genome annotation GTF file (usually gencode)." + st2_gencode_refmap : "Refmap file (produced by gffcompare) comparing the stringtie2 discovered transcriptome to the genome reference gtf." + st2_gencode_tmap : "Tmap file (produced by gffcompare) comparing the stringtie2 discovered transcriptome to the genome reference gtf." + st2_read_refmap : "Refmap file (produced by gffcompare) comparing the stringtie2 discovered transcriptome to input reads (in GFF format)." + st2_read_tmap : "Tmap file (produced by gffcompare) comparing the stringtie2 discovered transcriptome to input reads (in GFF format)." + gencode_st2_refmap : "Refmap file (produced by gffcompare) comparing the genome reference gtf to the stringtie2 discovered transcriptome." + gencode_st2_tmap : "Tmap file (produced by gffcompare) comparing the genome reference gtf to the stringtie2 discovered transcriptome." 
+ gencode_read_refmap : "Refmap file (produced by gffcompare) comparing the genome reference gtf to input reads (in GFF format)." + gencode_read_tmap : "Tmap file (produced by gffcompare) comparing the genome reference gtf to input reads (in GFF format)." + prefix : "Prefix for ouput file." + } + + Int disk_size_gb = 10 + 2*ceil(size(genome_gtf, "GB")) + + 2*ceil(size(st2_gencode_refmap, "GB")) + + 2*ceil(size(st2_gencode_tmap, "GB")) + + 2*ceil(size(st2_read_refmap, "GB")) + + 2*ceil(size(st2_read_tmap, "GB")) + + 2*ceil(size(gencode_st2_refmap, "GB")) + + 2*ceil(size(gencode_st2_tmap, "GB")) + + 2*ceil(size(gencode_read_refmap, "GB")) + + 2*ceil(size(gencode_read_tmap, "GB")) + + command <<< + time /python_scripts/quantify_gff_reads.py \ + --gencode_gtf ~{genome_gtf} \ + --st2-gencode-refmap ~{st2_gencode_refmap} \ + --st2-mas-seq-refmap ~{st2_read_refmap} \ + --gencode-st2-refmap ~{gencode_st2_refmap} \ + --gencode-mas-seq-refmap ~{gencode_read_refmap} \ + --st2-gencode-tmap ~{st2_gencode_tmap} \ + --st2-mas-seq-tmap ~{st2_read_tmap} \ + --gencode-st2-tmap ~{gencode_st2_tmap} \ + --gencode-mas-seq-tmap ~{gencode_read_tmap} \ + -o ~{prefix} + + # Here so the task changes and we don't get cached: + echo "" + echo "" + >>> + + output { + File gene_eq_class_labels_file = prefix + ".gene_equivalence_class_lookup.tsv" + File gene_assignments_file = prefix + ".gene_name_assignments.tsv" + File tx_equivalence_class_labels_file = prefix + ".equivalence_class_lookup.tsv" + File tx_equivalence_class_file = prefix + ".equivalence_classes.tsv" + File graph_gpickle = prefix + ".graph.gpickle" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 32, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: 
select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task CombineEqClassFiles { + meta { + description : "Combine equivalence classes and gene assignments from disjoint sets of reads produced by QuantifyGffComparison." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + Array[File] gene_eq_class_definitions + Array[File] gene_assignment_files + + Array[File] equivalence_class_definitions + Array[File] equivalence_classes + + String prefix = "combined" + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + gene_eq_class_definitions : "TSV files containing equivalence class definitions for genes as produced by QuantifyGffComparison.gene_eq_class_labels_file." + gene_assignment_files : "TSV files containing read -> gene equivalence class assignments as produced by QuantifyGffComparison.gene_assignments_file." + equivalence_class_definitions : "TSV files containing transcript equivalence class definitions as produced by QuantifyGffComparison.tx_equivalence_class_labels_file." + equivalence_classes : "TSV files containing read -> transcript equivalence class assignments as produced by QuantifyGffComparison.tx_equivalence_class_file." + prefix : "Prefix for ouput file." 
+ } + + Int disk_size_gb = 10 + 2*ceil(size(equivalence_class_definitions, "GB")) + + 2*ceil(size(equivalence_classes, "GB")) + + 2*ceil(size(gene_eq_class_definitions, "GB")) + + 2*ceil(size(gene_assignment_files, "GB")) + + command <<< + time /python_scripts/combine_tx_equivalence_classes.py \ + ~{sep=" " gene_eq_class_definitions} \ + ~{sep=" " gene_assignment_files} \ + ~{sep=" " equivalence_class_definitions} \ + ~{sep=" " equivalence_classes} + + mv gene_equivalence_class_lookup.tsv ~{prefix}.gene_equivalence_class_lookup.tsv + mv gene_name_assignments.tsv ~{prefix}.gene_name_assignments.tsv + mv equivalence_class_lookup.tsv ~{prefix}.equivalence_class_lookup.tsv + mv equivalence_classes.tsv ~{prefix}.equivalence_classes.tsv + >>> + + output { + File combined_gene_eq_class_defs = "~{prefix}.gene_equivalence_class_lookup.tsv" + File combined_gene_eq_class_assignments = "~{prefix}.gene_name_assignments.tsv" + File combined_tx_eq_class_defs = "~{prefix}.equivalence_class_lookup.tsv" + File combined_tx_eq_class_assignments = "~{prefix}.equivalence_classes.tsv" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 16, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + 
} +} + +task CopyEqClassInfoToTag { + meta { + description : "Copy the gene assignment for each given read into the given tag for each read." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File bam + File eq_class_file + + String gene_tag = "XG" + String eq_class_tag = "eq" + + String prefix = "combined" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size_gb = 10 + 2*ceil(size(bam, "GB")) + + 4*ceil(size(eq_class_file, "GB")) + + # TODO: Extract this code into its own module / file. + command <<< + set -euxo pipefail + +python << CODE +import pysam +from tqdm import tqdm + +with open(f"~{eq_class_file}", 'r') as f: + read_gene_dict = dict() + for line in tqdm(f): + if line.startswith("#"): + continue + read_name, tx_eq_class, gene_assignment = line.strip().split("\t") + read_gene_dict[read_name] = (tx_eq_class, gene_assignment) + +with pysam.AlignmentFile(f"~{bam}", "rb", check_sq=False, require_index=False) as bam_file: + with pysam.AlignmentFile(f"~{prefix}.bam", "wb", header=bam_file.header) as out_bam_file: + for read in tqdm(bam_file): + read.set_tag(f"~{eq_class_tag}", read_gene_dict[read.query_name][0]) + read.set_tag(f"~{gene_tag}", read_gene_dict[read.query_name][1]) + out_bam_file.write(read) +CODE + + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + samtools index -@${np} ~{prefix}.bam + >>> + + output { + File bam_out = "~{prefix}.bam" + File bai = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 32, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + 
select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/Transcriptomics/Preprocessing_Tasks.wdl b/wdl/lib/Transcriptomics/Preprocessing_Tasks.wdl new file mode 100644 index 0000000..8850e8b --- /dev/null +++ b/wdl/lib/Transcriptomics/Preprocessing_Tasks.wdl @@ -0,0 +1,542 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task SplitBamBySampleAndCellBarcodeTask { + + meta { + description : "Convert a single annotated (via the 10x tool), aligned bam file into individual FASTA files named by sample name and cell barcode. Also produces a manifest file for FLAIR to easily quantify output." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File aligned_annotated_bam + String output_base_name = "reads" + } + + parameter_meta { + aligned_annotated_bam : "Bam file containing aligned reads that have been annotated with the 10x tool." + output_base_name : "[optional] base name to give to every output file. Should correspond to some unique identifier from this dataset." 
+ } + + # 10x the total size of the input bam (uncompressed reads) + # 1x for the file itself + # 1x for wiggle-room + # 2x for tar/gz-ing the output: + Int disk_size = ((10+1+1)*2) * ceil(size(aligned_annotated_bam, "GB")) + + String fasta_tar_gz_name = "fasta_files_by_sample_and_barcode.tar.gz" + + command { + /python_scripts/split_annotated_reads_by_sample_and_cell_barcode.py -b ~{aligned_annotated_bam} -o ~{output_base_name} + tar -zcf ~{fasta_tar_gz_name} *.fasta + } + + output { + File flair_manifest = "${output_base_name}_flair_reads_manifest.tsv" + Array[File] sample_cell_barcode_fasta_files = glob("*.fasta") + File fasta_tar_gz_out = "~{fasta_tar_gz_name}" + } + + runtime { + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + memory: 16 + " GiB" + disks: "local-disk " + disk_size + " HDD" + boot_disk_gb: 10 + preemptible: 0 + cpu: 8 + } +} + +task DownsampleToIsoSeqEquivalent { + + meta { + description : "Downsample a given MAS-seq array element bam file into one containing only 1 read per ZMW (equivalent to IsoSeq)." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File array_element_bam + String prefix = "downsampled_masseq" + } + + parameter_meta { + array_element_bam : "Bam file containing aligned reads that have been annotated with the 10x tool." + prefix : "[optional] base name to give to every output file. Should correspond to some unique identifier from this dataset." 
+ } + + Int disk_size = 10 + 20 * ceil(size(array_element_bam, "GB")) + + String out_name = basename(array_element_bam, ".bam") + ".ZMW_downsampled.bam" + + command { + /python_scripts/downsample_masseq_by_zmw.py ~{array_element_bam} + + # TODO: THIS IS A HACK - FIX IT LATER + mv ~{out_name} ~{prefix}.bam + } + + output { + File downsampled_bam = "${prefix}.bam" + } + + runtime { + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + memory: 32 + " GiB" # Need a lot of ram here because we keep a set of ZMWs in memory + disks: "local-disk " + disk_size + " HDD" + boot_disk_gb: 10 + preemptible: 0 + cpu: 2 + } +} + +task DemuxMasSeqDataByIndex { + + meta { + description : "This workflow will split MAS-seq data that is indexed with a 10bp sequence at the 3' end." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File array_bam + } + + parameter_meta { + array_bam : "Bam file containing annotated MAS-seq array reads that contain a 10bp index near the 3' end.." + } + + Int disk_size = 10 + 20 * ceil(size(array_bam, "GB")) + + String base_out_name = basename(array_bam, ".bam") + + command { + /python_scripts/mas_seq_demux_by_index.py ~{array_bam} 1> demux_by_index.log + } + + output { + File demux_i1 = base_out_name + ".demux_i1.bam" + File demux_i2 = base_out_name + ".demux_i2.bam" + File demux_i3 = base_out_name + ".demux_i3.bam" + File demux_i4 = base_out_name + ".demux_i4.bam" + File demux_ambiguous = base_out_name + ".demux_ambiguous_indices.bam" + File log_file = "demux_by_index.log" + } + + runtime { + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + memory: 4 + " GiB" + disks: "local-disk " + disk_size + " HDD" + boot_disk_gb: 10 + preemptible: 0 + cpu: 2 + } +} + +task MergeDemuxMasSeqByIndexLogs { + + meta { + description : "This workflow will merge logs from the DemuxMasSeqDataByIndex task." 
+ author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + Array[File] demux_logs + } + + parameter_meta { + demux_logs : "Log files from DemuxMasSeqDataByIndex task." + } + + Int disk_size = 10 + 20 * ceil(size(demux_logs, "GB")) + + String out_log_name = "merged_demux_log.log" + + command <<< + + OUT_LOG="~{out_log_name}" + + t="t.txt" + a="a.txt" + ap="ap.txt" + i1="i1.txt" + i2="i2.txt" + i3="i3.txt" + i4="i4.txt" + tt="tt.txt" + tpr="tpr.txt" + trps="trps.txt" + + rm -f $OUT_LOG + + for f in ~{sep=' ' demux_logs} ; do + + grep "^Ambiguous read:" $f >> $OUT_LOG + + # Get the data from the log file: + tail -n 11 $f | head -n1 | awk '{print $NF}' >> $t + tail -n 10 $f | head -n1 | awk '{print $NF}' >> $a + tail -n 9 $f | head -n1 | awk '{print $NF}' | tr -d '%' >> $ap + + tail -n 5 $f | head -n1 | awk '{print $1}' >> $i1 + tail -n 5 $f | head -n1 | awk '{print $2}' >> $i2 + tail -n 5 $f | head -n1 | awk '{print $3}' >> $i3 + tail -n 5 $f | head -n1 | awk '{print $4}' >> $i4 + + tail -n 3 $f | head -n1 | awk '{print $NF}' | tr -d 's' >> $tt + tail -n 2 $f | head -n1 | awk '{print $NF}' | tr -d 's' >> $tpr + tail -n 1 $f | head -n1 | awk '{print $NF}' | tr -d 's' >> $trps + done + + awk 'BEGIN{s=0};{s+=$1};END{printf("Total reads seen: %d\n", s)}' $t >> $OUT_LOG + awk 'BEGIN{s=0};{s+=$1};END{printf("Num ambiguous reads: %d\n", s)}' $a >> $OUT_LOG + awk 'BEGIN{s=0};{s+=$1};END{printf("Ambiguity percentage: %2.3f%%\n", s/NR)}' $ap >> $OUT_LOG + echo "" >> $OUT_LOG + echo "Reads demuxed by index:" >> $OUT_LOG + echo -e "1\t2\t3\t4" >> $OUT_LOG + awk 'BEGIN{s=0};{s+=$1};END{printf("%d\t", s)}' $i1 >> $OUT_LOG + awk 'BEGIN{s=0};{s+=$1};END{printf("%d\t", s)}' $i2 >> $OUT_LOG + awk 'BEGIN{s=0};{s+=$1};END{printf("%d\t", s)}' $i3 >> $OUT_LOG + awk 'BEGIN{s=0};{s+=$1};END{printf("%d", s)}' $i4 >> $OUT_LOG + echo "" >> $OUT_LOG + echo "" >> $OUT_LOG + awk 'BEGIN{s=0};{s+=$1};END{printf("Time (total elapsed): %2.3fs\n", s/NR)}' $tt >> $OUT_LOG + awk 
'BEGIN{s=0};{s+=$1};END{printf("Time (per read): %2.3fs\n", s/NR)}' $tpr >> $OUT_LOG + awk 'BEGIN{s=0};{s+=$1};END{printf("Time (reads per second): %2.3f\n", s/NR)}' $trps >> $OUT_LOG + echo "" >> $OUT_LOG + + >>> + + output { + File merged_log = out_log_name + } + + runtime { + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + memory: 4 + " GiB" + disks: "local-disk " + disk_size + " HDD" + boot_disk_gb: 10 + preemptible: 0 + cpu: 2 + } +} + +task SplitBamByContig { + meta { + description : "Split a given bam file into separate files by contig." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File bam + + String prefix = "reads" + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + bam : "Bamfile to be split by contig." + prefix : "Prefix for ouput files." + } + + Int disk_size_gb = 10 + 5*ceil(size(bam, "GB")) + + String contig_list = "contig_list.txt" + + command <<< + /python_scripts/split_reads_by_contig.py ~{bam} ~{prefix} + + ls ~{prefix}.*.bam | sed 's#~{prefix}.\(.*\).bam#\1#' > ~{contig_list} + >>> + + output { + Array[File] contig_bams = glob("~{prefix}*.bam") + Array[String] contig_names = read_lines(contig_list) + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 8, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: 
select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ConvertSplicedBamToGff { + meta { + description : "Convert a given splice aligned bam file into a gff file." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File bam + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + bam : "Bamfile to be converted to gff." + } + + String base_name = basename(bam, ".bam") + + Int disk_size_gb = 10 + 5*ceil(size(bam, "GB")) + + command <<< + spliced_bam2gff -S -M ~{bam} > ~{base_name}.gff + >>> + + output { + File gff = "~{base_name}.gff" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 8, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-splicedbam2gff:0.0.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task GffCompare { + meta { + description : "Compare two GFF files" + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File gff_ref + File gff_query + File ref_fasta + File? ref_fasta_index + + String prefix = "out" + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + gff_ref : "Gff file to be used as a reference." 
+ gff_query : "Gff file to be used as a query (compared against the gff_ref)." + ref_fasta : "Reference fasta file." + ref_fasta_index : "Reference fasta file index." + } + + Int disk_size_gb = 10 + 2*ceil(size(gff_ref, "GB")) + 2*ceil(size(gff_query, "GB")) + 2*ceil(size(ref_fasta, "GB")) + + String query_base_name = basename(gff_query) + String query_gff_dir = sub(gff_query, query_base_name + "$", "") + + command <<< + # Because of how gffcompare works, we need to move the query file to our PWD: + + mv -v ~{gff_query} . + + time /gffcompare/gffcompare \ + -V \ + -r ~{gff_ref} \ + -s ~{ref_fasta} \ + ~{query_base_name} &> ~{prefix}.~{query_base_name}.gffcmp.log + + # Rename some output files so we can disambiguate them later: + mv gffcmp.~{query_base_name}.refmap ~{prefix}.gffcmp.~{query_base_name}.refmap + mv gffcmp.~{query_base_name}.tmap ~{prefix}.gffcmp.~{query_base_name}.tmap + + mv gffcmp.tracking ~{prefix}.~{query_base_name}.gffcmp.tracking + mv gffcmp.loci ~{prefix}.~{query_base_name}.gffcmp.loci + mv gffcmp.annotated.gtf ~{prefix}.~{query_base_name}.gffcmp.annotated.gtf + mv gffcmp.stats ~{prefix}.~{query_base_name}.gffcmp.stats + >>> + + output { + File refmap = "~{prefix}.gffcmp.~{query_base_name}.refmap" + File tmap = "~{prefix}.gffcmp.~{query_base_name}.tmap" + + File tracking = "~{prefix}.~{query_base_name}.gffcmp.tracking" + File loci = "~{prefix}.~{query_base_name}.gffcmp.loci" + File annotated_gtf = "~{prefix}.~{query_base_name}.gffcmp.annotated.gtf" + File stats = "~{prefix}.~{query_base_name}.gffcmp.stats" + File log = "~{prefix}.~{query_base_name}.gffcmp.log" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 8, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: 
select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task RestoreOriginalReadNames { + meta { + description : "Copies the contents of the XM tag to the read name and sets the XM tag to the read name." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File bam + String prefix = "out" + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + bam : "Bam file in which to restore the original read names." + } + + Int disk_size_gb = 10 + 2*ceil(size(bam, "GB")) + + command <<< + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + + time /python_scripts/restore_original_read_names.py ~{bam} + + # Rename some output files so we can disambiguate them later: + mv out.original_read_names_restored.bam ~{prefix}.bam + + samtools index -@${np} ~{prefix}.bam + >>> + + output { + File bam_out = "~{prefix}.bam" + File bai_out = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 8, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, 
default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task CorrectUmisWithSetCover { + meta { + description : "Corrects the UMIs in the given reads using a set cover algorithm" + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File bam + String prefix = "out" + + Boolean is_extracted = true + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + bam : "Bam file for which to correct UMIs." + prefix : "Prefix to assign to output files." + } + + Int disk_size_gb = 10 + 10*ceil(size(bam, "GB")) + + String is_extracted_arg = if (is_extracted) then "--pre-extracted" else "" + + command <<< + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + + time /python_scripts/umi_correction.py \ + --input_bam ~{bam} \ + --output_bam ~{prefix}.corrected_umis.unsorted.bam \ + --filtered_bam ~{prefix}.corrected_umis.failed_filters.bam \ + ~{is_extracted_arg} \ + --config /python_scripts/umi_correction.yaml + + samtools sort ~{prefix}.corrected_umis.unsorted.bam > ~{prefix}.corrected_umis.bam + samtools index -@${np} ~{prefix}.corrected_umis.bam + >>> + + output { + File corrected_umi_reads = "~{prefix}.corrected_umis.bam" + File corrected_umi_reads_index = "~{prefix}.corrected_umis.bam.bai" + File uncorrected_umi_reads = "~{prefix}.corrected_umis.failed_filters.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 32, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + 
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} \ No newline at end of file diff --git a/wdl/lib/Transcriptomics/UMI_Tools.wdl b/wdl/lib/Transcriptomics/UMI_Tools.wdl new file mode 100644 index 0000000..bd3c7ce --- /dev/null +++ b/wdl/lib/Transcriptomics/UMI_Tools.wdl @@ -0,0 +1,100 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task Run_Group { + + meta { + description : "Run umi-tools group on a bam file." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File aligned_transcriptome_reads + File aligned_transcriptome_reads_index + + String gene_tag = "XG" + String cell_barcode_tag = "CB" + String umi_tag = "ZU" + + Boolean do_per_cell = true + + String prefix = "umi_tools_group" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size_gb = 10 + 20*ceil(size(aligned_transcriptome_reads, "GB") + size(aligned_transcriptome_reads_index, "GB")) + + String per_cell_args = if do_per_cell then " --per-cell --cell-tag " + cell_barcode_tag + " " else "" + + String memory_log_file = "memory_use.txt" + + command <<< + + # Set up memory logging daemon: + MEM_LOG_INTERVAL_s=5 + DO_MEMORY_LOG=true + while $DO_MEMORY_LOG ; do + date + date +%s + cat /proc/meminfo + sleep $MEM_LOG_INTERVAL_s + done >> ~{memory_log_file} & + mem_pid=$! 
+ + set -euxo pipefail + + # Run umi-tools group: + umi_tools group \ + --buffer-whole-contig \ + --no-sort-output \ + --per-gene \ + ~{per_cell_args} \ + --gene-tag ~{gene_tag} \ + --extract-umi-method tag \ + --umi-tag ~{umi_tag} \ + -I ~{aligned_transcriptome_reads} \ + --group-out=~{prefix}.tsv \ + --output-bam \ + --log=~{prefix}.log > ~{prefix}.bam + + + # Stop the memory daemon softly. Then stop it hard if it's not cooperating: + set +e + DO_MEMORY_LOG=false + sleep $(($MEM_LOG_INTERVAL_s * 2)) + kill -0 $mem_pid &> /dev/null + if [ $? -ne 0 ] ; then + kill -9 $mem_pid + fi + >>> + + output { + File output_bam = "~{prefix}.bam" + File output_tsv = "~{prefix}.tsv" + File memory_log = "~{memory_log_file}" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 64, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-transcript_utils:0.0.14" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} \ No newline at end of file diff --git a/wdl/lib/Utility/AssignChildLongReads.wdl b/wdl/lib/Utility/AssignChildLongReads.wdl new file mode 100644 index 0000000..67db39e --- /dev/null +++ b/wdl/lib/Utility/AssignChildLongReads.wdl @@ -0,0 +1,227 @@ +version 1.0 + +########################################################################################## 
+ +## A workflow that performs trio-binning of child long reads given parental (short) reads. +## Based on the trio-canu publication +## De novo assembly of haplotype-resolved genomes with trio binning +## https://www.nature.com/articles/nbt.4277 +## This holds the sub-workflow for +## part two: given the k-mer stats database from part one, classify child long reads +########################################################################################## + +import "../../structs/Structs.wdl" + +# we separate this out based on two concerns: +# 1. we can test out using different k-value when collecting parental k-mer stats +# 2. we can collect parental k-mer stats once and classify all children reads (different siblings, technologies) separately +workflow AssignChildLongReadsGivenParentalKmerStats { + + input{ + + String workdir_name + + String child_long_reads_bucket + + String long_read_platform + + Array[File] meryl_db_files_father + Array[File] meryl_db_files_mother + + File meryl_stats_father + File meryl_stats_mother + + File vm_local_monitoring_script + + Int child_read_assign_threads_est = 36 + Int child_read_assign_memoryG_est = 32 + + Boolean? 
run_with_debug = false + } + + parameter_meta { + workdir_name: "name of working directory" + child_long_reads_bucket: "GCS bucket path holding FASTA/FASTQ of child long reads" + long_read_platform: "platform of long read sequencing; currently only one of [pacbio-raw, nanopore-raw] is supported" + + meryl_db_files_father: "Meryl database files on paternal (short) reads" + meryl_db_files_mother: "Meryl database files on maternal (short) reads" + meryl_stats_father: "Meryl statistics single file on paternal (short) reads" + meryl_stats_mother: "Meryl statistics single file on maternal (short) reads" + + vm_local_monitoring_script: "GCS file holding a resource monitoring script that runs locally and collects info for a very specific purpose" + meryl_operations_threads_est: "[default-valued] estimate on how many threads to allocate to k-mer stats collection step" + child_read_assign_threads_est: "[default-valued] estimate on how many threads to allocate to the child longread classification step" + child_read_assign_memoryG_est: "[default-valued] estimate on how many GB memory to allocate to the child longread classification step" + run_with_debug: "[optional] whether to run in debug mode (takes significantly more disk space and more logs); defaults to false" + } + + call AssignChildLongReads { + input: + workdir_name = workdir_name, + + meryl_db_files_father = meryl_db_files_father, + meryl_db_files_mother = meryl_db_files_mother, + meryl_stats_father = meryl_stats_father, + meryl_stats_mother = meryl_stats_mother, + + child_long_reads_bucket = child_long_reads_bucket, + long_read_platform = long_read_platform, + + child_read_assign_threads_est = child_read_assign_threads_est, + child_read_assign_memoryG_est = child_read_assign_memoryG_est, + vm_local_monitoring_script = vm_local_monitoring_script, + run_with_debug = run_with_debug + } + + output { + File reads_assigned_to_father = AssignChildLongReads.reads_assigned_to_father + File reads_assigned_to_mother = 
AssignChildLongReads.reads_assigned_to_mother + File unassigned_reads = AssignChildLongReads.unassigned_reads + } +} + +############################################################### + +# actually assign child long reads +task AssignChildLongReads { + input{ + + String workdir_name + + String child_long_reads_bucket + # currently the following only takes one of [pacbio-raw, nanopore-raw] + String long_read_platform + + Array[File] meryl_db_files_father + Array[File] meryl_db_files_mother + + File meryl_stats_father + File meryl_stats_mother + + Int child_read_assign_threads_est + Int child_read_assign_memoryG_est + + File vm_local_monitoring_script + + Boolean run_with_debug = false + + RuntimeAttr? runtime_attr_override + } + + String extra_args = if (run_with_debug) then "-debug" else " " + String resource_script_name = basename(vm_local_monitoring_script) + + command <<< + set -euo pipefail + + ############################## + # parallel localize the input reads (remove trailing slash first to be safe) + export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` + child_path=$(echo ~{child_long_reads_bucket} | sed 's:/*$::') + if ! 
gsutil ls "${child_path}/" | grep -Eq "(fa|fq|fasta|fastq)(.gz)?$"; then + echo "no reads in ~{child_long_reads_bucket}" && exit 1 + fi + echo "===================================" + echo "BEGIN LOCALIZING CHILD LONG READS" + date -u + mkdir child + gsutil ls "${child_path}/" | grep -E "(fa|fq|fasta|fastq)(.gz)?$" | gsutil -mq cp -I child/ + echo "Child localized" + date -u + echo "DONE LOCALIZING CHILD LONG READS" + echo "===================================" + read_input_format=$(gsutil ls "${child_path}/" | grep -Eo "(fa|fq|fasta|fastq)(.gz)?$" | uniq) + + ########## + # prep files from previous stages + mkdir -p workdir/canu-logs workdir/canu-scripts workdir/haplotype/0-kmers/ + mkdir -p workdir/haplotype/0-kmers/haplotype-Father.meryl \ + workdir/haplotype/0-kmers/haplotype-Mother.meryl + for ff in `ls ~{sep=' ' meryl_db_files_father}`; do + mv $ff workdir/haplotype/0-kmers/haplotype-Father.meryl/ + done + for ff in `ls ~{sep=' ' meryl_db_files_mother}`; do + mv $ff workdir/haplotype/0-kmers/haplotype-Mother.meryl/ + done + + mv ~{meryl_stats_father} workdir/haplotype/0-kmers/ + mv ~{meryl_stats_mother} workdir/haplotype/0-kmers/ + + # we need to have these success files to fool canu + touch workdir/haplotype/0-kmers/meryl-count.success + touch workdir/haplotype/0-kmers/meryl-merge.success + touch workdir/haplotype/0-kmers/meryl-subtract.success + + ########## + echo "===================================" + date -u + machine_mem=`echo $(($(getconf _PHYS_PAGES) * $(getconf PAGE_SIZE) / (1024 * 1024 * 1024)))` + hap_mem=$(($machine_mem - 2)) + echo "Limiting hap memory to: ${hap_mem} G" + export MONITOR_MOUNT_POINT="/cromwell_root" + bash ~{vm_local_monitoring_script} &> resources.log & + job_id=$(ps -aux | grep -F ~{resource_script_name} | head -1 | awk '{print $2}') + canu \ + -haplotype \ + -p ~{workdir_name} \ + -d /cromwell_root/workdir/ \ + genomeSize=3.1G \ + beginConfigAt=hap \ + stopAfter=haplotype \ + hapMemory=${hap_mem} \ + -hapNames "Father Mother" \ 
+ -~{long_read_platform} /cromwell_root/child/*${read_input_format} \ + ~{extra_args} || + cat workdir/haplotype/*.out + kill $job_id || true + du -sh workdir/haplotype/* + date -u + echo "===================================" + ########## + + mv workdir/haplotype/haplotype.log . + mv workdir/haplotype/splitHaplotype.000001.out . + mv workdir/haplotype/splitHaplotype.sh . + mv workdir/haplotype/haplotype-*.fasta.gz . + + # save logs and scripts + tar -czf canu-logs.tar.gz workdir/canu-logs + tar -czf canu-scripts.tar.gz workdir/canu-scripts + >>> + + output { + File logs = "canu-logs.tar.gz" + File scripts = "canu-scripts.tar.gz" + + File resource_monitoring_log = "resources.log" + + File assignment_log = "haplotype.log" + File assignment_job_log = "splitHaplotype.000001.out" + File assignment_script = "splitHaplotype.sh" + + File reads_assigned_to_father = "haplotype-Father.fasta.gz" + File reads_assigned_to_mother = "haplotype-Mother.fasta.gz" + File unassigned_reads = "haplotype-unknown.fasta.gz" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: child_read_assign_threads_est, + mem_gb: child_read_assign_memoryG_est, + disk_gb: 500, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "quay.io/broad-long-read-pipelines/canu:v1.9_wdl_patch_varibale_k" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} 
diff --git a/wdl/lib/Utility/BAMutils.wdl b/wdl/lib/Utility/BAMutils.wdl new file mode 100644 index 0000000..d850d4e --- /dev/null +++ b/wdl/lib/Utility/BAMutils.wdl @@ -0,0 +1,44 @@ +version 1.0 + +task GetReadGroupInfo { + meta { + description: + "Get some read group information given a single-readgroup BAM. Will fail if the information isn't present." + } + + input { + String uBAM # not using file as call-caching brings not much benefit + + Array[String] keys + } + + parameter_meta { + keys: "A list of requested fields in the RG line, e.g. ID, SM, LB." + } + + command <<< + set -eux + + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + samtools view -H ~{uBAM} | grep "^@RG" | tr '\t' '\n' > rh_header.txt + + for attribute in ~{sep=' ' keys}; do + value=$(grep "^${attribute}" rh_header.txt | awk -F ':' '{print $2}') + echo -e "${attribute}\t${value}" >> "result.txt" + done + >>> + + output { + Map[String, String] read_group_info = read_map("result.txt") + } + + runtime { + cpu: 1 + memory: "4 GiB" + disks: "local-disk 100 HDD" + bootDiskSizeGb: 10 + preemptible: 2 + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + } +} diff --git a/wdl/lib/Utility/Finalize.wdl b/wdl/lib/Utility/Finalize.wdl new file mode 100644 index 0000000..21004ba --- /dev/null +++ b/wdl/lib/Utility/Finalize.wdl @@ -0,0 +1,360 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task FinalizeToFile { + input { + File file + String outdir + String? name + + File? keyfile + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + file: { + description: "file to finalize", + localization_optional: true + } + keyfile : "[optional] File used to key this finalization. Finalization will not take place until the KeyFile exists. This can be used to force the finalization to wait until a certain point in a workflow. NOTE: The latest WDL development spec includes the `after` keyword which will obviate this." 
+ outdir: "directory to which files should be uploaded" + name: "name to set for uploaded file" + } + + String gcs_output_dir = sub(outdir, "/+$", "") + String gcs_output_file = gcs_output_dir + "/" + select_first([name, basename(file)]) + + command <<< + set -euxo pipefail + + gsutil -m cp "~{file}" "~{gcs_output_file}" + >>> + + output { + String gcs_path = gcs_output_file + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-finalize:0.1.2" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FinalizeToDir { + input { + Array[File] files + String outdir + + File? keyfile + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + files: { + description: "files to finalize", + localization_optional: true + } + keyfile : "[optional] File used to key this finaliation. Finalization will not take place until the KeyFile exists. This can be used to force the finaliation to wait until a certain point in a workflow. NOTE: The latest WDL development spec includes the `after` keyword which will obviate this." 
+ outdir: "directory to which files should be uploaded" + } + + String gcs_output_dir = sub(outdir, "/+$", "") + + command <<< + set -euxo pipefail + + cat ~{write_lines(files)} | gsutil -m cp -I "~{gcs_output_dir}" + >>> + + output { + String gcs_dir = gcs_output_dir + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-finalize:0.1.2" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FinalizeTarGzContents { + meta { + description : "Copies the contents of the given tar.gz file to the specified bucket." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File tar_gz_file + String outdir + + File? keyfile + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + tar_gz_file : "Gzipped tar file whose contents we'll copy." + outdir : "Google cloud path to the destination folder." + + keyfile : "[optional] File used to key this finaliation. Finalization will not take place until the KeyFile exists. This can be used to force the finaliation to wait until a certain point in a workflow. NOTE: The latest WDL development spec includes the `after` keyword which will obviate this." + + runtime_attr_override : "[optional] Additional runtime parameters." 
+ } + + # This idiom ensures that we don't accidentally have double-slashes in our GCS paths + String gcs_output_dir = sub(sub(outdir + "/", "/+", "/"), "gs:/", "gs://") + + command <<< + set -euxo pipefail + + mkdir tmp + cd tmp + tar -zxf ~{tar_gz_file} + + gsutil -m cp -r * ~{gcs_output_dir} + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-finalize:0.1.2" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task WriteCompletionFile { + + meta { + description : "Write a file to the given directory indicating the run has completed." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + String outdir + File? keyfile + } + + parameter_meta { + outdir : "Google cloud path to the destination folder." + keyfile : "[optional] File used to key this finaliation. Finalization will not take place until the KeyFile exists. This can be used to force the finaliation to wait until a certain point in a workflow. NOTE: The latest WDL development spec includes the `after` keyword which will obviate this." 
+ } + + command <<< + set -euxo pipefail + + completion_file="COMPLETED_AT_$(date +%Y%m%dT%H%M%S).txt" + touch $completion_file + + gsutil cp $completion_file ~{outdir} + >>> + + ######################### + + runtime { + cpu: 1 + memory: 1 + " GiB" + disks: "local-disk " + 10 + " HDD" + bootDiskSizeGb: 10 + preemptible: 2 + maxRetries: 2 + docker: "us.gcr.io/broad-dsp-lrma/lr-finalize:0.1.2" + } +} + +task WriteNamedFile { + + meta { + description : "Write a file to the given directory with the given name." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + String name + String outdir + File? keyfile + } + + parameter_meta { + name : "Name of the file to write." + outdir : "Google cloud path to the destination folder." + keyfile : "[optional] File used to key this finaliation. Finalization will not take place until the KeyFile exists. This can be used to force the finaliation to wait until a certain point in a workflow. NOTE: The latest WDL development spec includes the `after` keyword which will obviate this." + } + + command <<< + set -euxo pipefail + + touch "~{name}" + + gsutil cp "~{name}" ~{outdir} + >>> + + ######################### + + runtime { + cpu: 1 + memory: 1 + " GiB" + disks: "local-disk " + 10 + " HDD" + bootDiskSizeGb: 10 + preemptible: 2 + maxRetries: 2 + docker: "us.gcr.io/broad-dsp-lrma/lr-finalize:0.1.2" + } +} + +task CompressAndFinalize { + meta { + description: "Gzip a file and finalize" + } + input { + File file + String outdir + String? name + + RuntimeAttr? 
runtime_attr_override + } + + String base = basename(file) + String out = sub(select_first([name, base]), ".gz$", "") + ".gz" + # THIS IS ABSOLUTELY CRITICAL: DON'T CHANGE TYPE TO FILE, AS CROMWELL WILL TRY TO LOCALIZE THIS NON-EXISTENT FILE + String gcs_output_file = sub(outdir, "/+$", "") + "/" + out + + Int disk_size = 2 * ceil(size(file, "GB")) + + command <<< + set -euxo pipefail + + gzip -vkc ~{file} > "~{base}.gz" + gsutil cp "~{base}.gz" "~{gcs_output_file}" + >>> + + output { + String gcs_path = gcs_output_file + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-finalize:0.1.2" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FinalizeAndCompress { + meta { + description: "Gzip a bunch of files and finalize to the same \'folder\'" + } + input { + Array[File] files + String outdir + + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + String gcs_output_file = sub(outdir, "/+$", "") + "/" + prefix + "/" + + Int disk_size = 5 * ceil(size(files, "GB")) + + command <<< + set -euxo pipefail + + for ff in ~{sep=' ' files}; + do + base="$(basename -- ${ff})" + mv "${ff}" "${base}" && gzip -vk "${base}" + done + + gsutil -m cp /cromwell_root/*.gz "~{gcs_output_file}" + >>> + + output { + String gcs_path = gcs_output_file + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 7, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-finalize:0.1.2" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/Utility/GeneralUtils.wdl b/wdl/lib/Utility/GeneralUtils.wdl new file mode 100644 index 0000000..667452d --- /dev/null +++ b/wdl/lib/Utility/GeneralUtils.wdl @@ -0,0 +1,50 @@ +version 1.0 + +# todo: move all existing, simiar util tasks here +# Hosting utils that just needs basic Linux shell programs + +task TarGZFiles { + meta { + description: + "Zip up a list of files to a tar.gz file." + } + + input { + Array[File] files + String name + } + + command <<< + set -eux + mkdir -p save/ + for ff in ~{sep=' ' files}; do cp "${ff}" save/ ; done + tar -cvzf ~{name}.tar.gz -C save/ . 
+ >>> + + output { + File you_got_it = "~{name}.tar.gz" + } + + runtime { + disks: "local-disk 100 HDD" + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +task GetTodayDate { + meta { + description: "Generates a YYYY-MM-DD date of today (when this task is called). UTC." + volatile: true + } + command { + date '+%Y-%m-%d' + } + + output { + String yyyy_mm_dd = read_string(stdout()) + } + runtime { + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + } +} diff --git a/wdl/lib/Utility/Hail.wdl b/wdl/lib/Utility/Hail.wdl new file mode 100644 index 0000000..980149a --- /dev/null +++ b/wdl/lib/Utility/Hail.wdl @@ -0,0 +1,75 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task ConvertToHailMT { + meta { + description: "Convert a .vcf.bgz file to a Hail MatrixTable and copy it to a final gs:// URL." + } + + input { + File gvcf + File tbi + + String reference = "GRCh38" + String? ref_fasta + String? ref_fai + String prefix = "out" + + String outdir + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 3*ceil(size(gvcf, "GB")) + + command <<< + set -x + + python3 <>> + + output { + String gcs_path = "~{outdir}/~{prefix}.mt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 64, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "hailgenetics/hail:0.2.105" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/Utility/JupyterNotebooks.wdl b/wdl/lib/Utility/JupyterNotebooks.wdl new file mode 100644 index 0000000..63abe46 --- /dev/null +++ b/wdl/lib/Utility/JupyterNotebooks.wdl @@ -0,0 +1,265 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task MASIsoSeqReport { + + meta { + description : "Create a report for a given MAS-ISO-Seq run which summarizes the results using a given Jupyter Notebook template." 
+ author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File notebook_template + + String sample_name + + File subreads_stats + File ccs_reads_stats + File array_elements_stats + File ccs_report_file + + File raw_ccs_bam_file + File array_element_bam_file + File array_elements_genome_aligned + File ccs_rejected_bam_file + + File annotated_bam_file + + File longbow_passed_reads_file + File longbow_failed_reads_file + + File longbow_passed_ccs_reads + File longbow_failed_ccs_reads + File ccs_reclaimable_reads + File ccs_reclaimed_reads + File ccs_rejected_longbow_failed_reads + File raw_array_elements + File ccs_reclaimed_array_elements + + File zmw_stats_json_gz + + File? zmw_subread_stats_file + File? polymerase_read_lengths_file + File? approx_raw_subread_array_lengths + + File? ten_x_metrics_file + String mas_seq_model + + File workflow_dot_file + + String prefix = "" + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + notebook_template : "Jupyter notebook MASSeq template to run with the given data to produce a MASSeq report." + + sample_name : "Name of the MAS-seq sample being analyzed in this report." + + subreads_stats : "Samtools stats file created from the raw subreads from the PacBio instrument." + ccs_reads_stats : "Samtools raw stats file created from the aligned CCS corrected reads from the PacBio instrument." + array_elements_stats : "Samtools raw stats file created from the aligned MASSeq array elements." + + ccs_report_file : "CCS report file from the CCS run for the data from the PacBio instrument." + raw_ccs_bam_file : "Unaligned reads file in BAM format from the CCS process (pre-array splitting)." + + array_element_bam_file : "Transcriptome aligned reads file in BAM format containing aligned MASSeq array elements as individual reads." + array_elements_genome_aligned : "Genome aligned reads file in BAM format containing aligned MASSeq array elements as individual reads." 
+ ccs_rejected_bam_file : "Bam file containing all subreads from zmws that were rejected by CCS." + + annotated_bam_file : "Bam file containing ccs corrected reads with annotated sections in the SG tag." + + longbow_passed_reads_file : "Bam file containing all reads that passed the longbow filter for the model used in this run (both ccs passed and reclaimed)." + longbow_failed_reads_file : "Bam file containing alll reads that failed the longbow filter for the model used in this run (both ccs passed and reclaimed)." + + longbow_passed_ccs_reads : "Bam file containing ccs corrected reads that passed the longbow filter for the model used in this run (CCS Corrected reads ONLY)." + longbow_failed_ccs_reads : "Bam file containing ccs corrected reads that failed the longbow filter for the model used in this run (CCS Corrected reads ONLY)." + ccs_reclaimable_reads : "Bam file containing ccs rejected reads that are deemed to be reclaimable." + ccs_reclaimed_reads : "Bam file containing ccs rejected reads that have been reclaimed." + ccs_rejected_longbow_failed_reads : "Bam file containing ccs reclaimable reads that did not pass longbow filtering and were not reclaimed." + raw_array_elements : "Bam file containing the raw unaligned array elements created from the longbow_passed_reads_file." + ccs_reclaimed_array_elements : "Bam file containing the unaligned array elements created from reclaimed CCS reads." + + zmw_stats_json_gz : "ZMW stats json.gz file from the PacBio instrument." + + zmw_subread_stats_file : "[optional] File containing statistics about the subreads from each ZMW (created by collect_zmw_subread_stats.py in the PBUtils docker container)." 
+ polymerase_read_lengths_file : "[optional] File containing the lengths of each polymerase read from the sequencer (as created by collect_polymerase_read_lengths.py)" + approx_raw_subread_array_lengths : "[optional] File containing the approximate array length information from the raw (pre-ccs) subreads file (created by get_approx_raw_subread_array_lengths.py in the Cartographer docker container)." + + ten_x_metrics_file : "[optional] Stats file from the 10x tool run for the data in this MASSeq run. If not supplied stats will not be displayed in the resulting report." + mas_seq_model : "Built-in mas-seq model to use." + + workflow_dot_file : "DOT file containing the representation of this workflow used to create and analyze the data. This is included in the QC reports (the DOT file can be generated with womtool)." + + prefix : "[optional] Prefix to prepend to the name of the generated report." + runtime_attr_override : "[optional] Runtime attributes struct with which to override the docker container runtime.." 
+ } + + String nb_name = prefix + "report.ipynb" + String html_out = prefix + "report.html" + String pdf_out = prefix + "report.pdf" + + Int disk_size = 20 + 8*ceil(( + size(notebook_template, "GB") + + size(subreads_stats, "GB") + + size(ccs_reads_stats, "GB") + + size(ccs_report_file, "GB") + + size(raw_ccs_bam_file, "GB") + + size(array_element_bam_file, "GB") + + size(ccs_rejected_bam_file, "GB") + + size(annotated_bam_file, "GB") + + size(raw_ccs_bam_file, "GB") + + size(zmw_subread_stats_file, "GB") + + size(polymerase_read_lengths_file, "GB") + + size(ten_x_metrics_file, "GB") + + size(workflow_dot_file, "GB") + )) + + # Handle the optional files: + String ten_x_metrics_file_flag = if defined(ten_x_metrics_file) then "true" else "false" + String zmw_subread_stats_file_flag = if defined(zmw_subread_stats_file) then "true" else "false" + String polymerase_read_lengths_file_flag = if defined(polymerase_read_lengths_file) then "true" else "false" + String approx_raw_subread_array_lengths_flag = if defined(approx_raw_subread_array_lengths) then "true" else "false" + + command <<< + set -euxo pipefail + + # Set up memory logging daemon: + MEM_LOG_INTERVAL_s=5 + DO_MEMORY_LOG=true + while $DO_MEMORY_LOG ; do + date + date +%s + cat /proc/meminfo + sleep $MEM_LOG_INTERVAL_s + done >> memory_log.txt & + mem_pid=$! 
+ + # Copy the notebook template to our current folder: + cp ~{notebook_template} ~{nb_name} + + # Create a template to create the html report with collapsed code: + echo "{%- extends 'full.tpl' -%}" > hidecode.tpl + echo "" >> hidecode.tpl + echo "{% block input_group %}" >> hidecode.tpl + echo " {%- if cell.metadata.get('nbconvert', {}).get('show_code', False) -%}" >> hidecode.tpl + echo " ((( super() )))" >> hidecode.tpl + echo " {%- endif -%}" >> hidecode.tpl + echo "{% endblock input_group %}" >> hidecode.tpl + + # Set some environment variables for the notebook to read in: + export DATE_RUN="$(date)" + export WDL_NAME="PB10xMasSeqArraySingleFlowcell.wdl" + export REPO_INFO="git@github.com:broadinstitute/long-read-pipelines.git" + + # Prepare the config file: + rm -f mas-seq_qc_inputs.config + + echo "~{sample_name}" >> mas-seq_qc_inputs.config + + echo "~{subreads_stats}" >> mas-seq_qc_inputs.config + echo "~{ccs_reads_stats}" >> mas-seq_qc_inputs.config + echo "~{array_elements_stats}" >> mas-seq_qc_inputs.config + echo "~{ccs_report_file}" >> mas-seq_qc_inputs.config + + echo "~{raw_ccs_bam_file}" >> mas-seq_qc_inputs.config + echo "~{array_element_bam_file}" >> mas-seq_qc_inputs.config + echo "~{array_elements_genome_aligned}" >> mas-seq_qc_inputs.config + echo "~{ccs_rejected_bam_file}" >> mas-seq_qc_inputs.config + + echo "~{annotated_bam_file}" >> mas-seq_qc_inputs.config + + echo "~{longbow_passed_reads_file}" >> mas-seq_qc_inputs.config + echo "~{longbow_failed_reads_file}" >> mas-seq_qc_inputs.config + + echo "~{longbow_passed_ccs_reads}" >> mas-seq_qc_inputs.config + echo "~{longbow_failed_ccs_reads}" >> mas-seq_qc_inputs.config + echo "~{ccs_reclaimable_reads}" >> mas-seq_qc_inputs.config + echo "~{ccs_reclaimed_reads}" >> mas-seq_qc_inputs.config + echo "~{ccs_rejected_longbow_failed_reads}" >> mas-seq_qc_inputs.config + echo "~{raw_array_elements}" >> mas-seq_qc_inputs.config + echo "~{ccs_reclaimed_array_elements}" >> mas-seq_qc_inputs.config + 
+ echo "~{zmw_stats_json_gz}" >> mas-seq_qc_inputs.config + + if ~{zmw_subread_stats_file_flag} ; then + echo "~{zmw_subread_stats_file}" >> mas-seq_qc_inputs.config + else + echo "NON-EXISTENT-PLACEHOLDER" >> mas-seq_qc_inputs.config + fi + if ~{polymerase_read_lengths_file_flag} ; then + echo "~{polymerase_read_lengths_file}" >> mas-seq_qc_inputs.config + else + echo "NON-EXISTENT-PLACEHOLDER" >> mas-seq_qc_inputs.config + fi + if ~{approx_raw_subread_array_lengths_flag} ; then + echo "~{approx_raw_subread_array_lengths}" >> mas-seq_qc_inputs.config + else + echo "NON-EXISTENT-PLACEHOLDER" >> mas-seq_qc_inputs.config + fi + + if ~{ten_x_metrics_file_flag} ; then + echo "~{ten_x_metrics_file}" >> mas-seq_qc_inputs.config + else + echo "NON-EXISTENT-PLACEHOLDER" >> mas-seq_qc_inputs.config + fi + echo "~{mas_seq_model}" >> mas-seq_qc_inputs.config + + echo "~{workflow_dot_file}" >> mas-seq_qc_inputs.config + + # Do the conversion: + + # Run the notebook and populate the notebook itself: + jupyter nbconvert --execute ~{nb_name} --to notebook --inplace --no-prompt --no-input --clear-output --debug --ExecutePreprocessor.timeout=None + + # Convert the notebook output we created just above here to the HTML report: + jupyter nbconvert ~{nb_name} --to html --no-prompt --no-input --debug --ExecutePreprocessor.timeout=None + + # Create a tar.gz of the figures directory: + tar -zcf figures.tar.gz figures + + # Create a dummy pickle for process safety: + touch dummy.pickle + + # Stop the memory daemon softly. Then stop it hard if it's not cooperating: + set +e + DO_MEMORY_LOG=false + sleep $(($MEM_LOG_INTERVAL_s * 2)) + kill -0 $mem_pid &> /dev/null + if [ $? 
-eq 0 ] ; then + kill -9 $mem_pid + fi + >>> + + output { + File populated_notebook = nb_name + File html_report = html_out + File figures_tar_gz = "figures.tar.gz" + File generated_config = "mas-seq_qc_inputs.config" + + Array[File] pickles = glob("*.pickle") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 64, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-jupyter_interactive:0.0.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" # LOCAL here is a local SSD - much faster and even money with normal disk if preemptible + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/Utility/ONTUtils.wdl b/wdl/lib/Utility/ONTUtils.wdl new file mode 100644 index 0000000..1d1deb4 --- /dev/null +++ b/wdl/lib/Utility/ONTUtils.wdl @@ -0,0 +1,189 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task FindSequencingSummaryFiles { + input { + String gcs_input_dir + + RuntimeAttr? runtime_attr_override + } + + String indir = sub(gcs_input_dir, "/$", "") + + command <<< + for summary_file in $(gsutil ls "~{indir}/**sequencing_summary*.txt*") + do + DIR=$(dirname $summary_file) + echo ${DIR} + + gsutil ls "${DIR}" | grep fastq_pass && gsutil ls "${DIR}" | grep fast5_pass + + if [ $? 
-eq 0 ]; then + FASTQ_COUNT=$(gsutil ls "${DIR}/fastq_pass/*.fastq*" | wc -l) + FAST5_COUNT=$(gsutil ls "${DIR}/fast5_pass/*.fast5*" | wc -l) + + echo "${FASTQ_COUNT} ${FAST5_COUNT}" + + if [ ${FASTQ_COUNT} -eq ${FAST5_COUNT} ]; then + echo $summary_file >> summaries.txt + else + echo "# fastq != # fast5. Skipped." + fi + else + echo "No passing fastq and fast5 files. Skipped." + fi + + echo "" + done + >>> + + output { + Array[String] summary_files = read_lines("summaries.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 1, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task GetRunInfo { + input { + String final_summary + + RuntimeAttr? 
runtime_attr_override + } + + command <<< + set -euxo pipefail + + gsutil cat "~{final_summary}" | sed 's/=[[:space:]]*$/=unknown/' | sed 's/=/\t/g' > run_info.txt + >>> + + output { + Map[String, String] run_info = read_map("run_info.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 1, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ListFiles { + input { + String sequencing_summary + String suffix + + RuntimeAttr? 
runtime_attr_override + } + + String indir = sub(sub(sequencing_summary, basename(sequencing_summary), ""), "/$", "") + + command <<< + set -euxo pipefail + + gsutil ls "~{indir}/**.~{suffix}*" | grep -v fail > files.txt + cat files.txt | wc -l > lc.txt + >>> + + output { + File manifest = "files.txt" + Int count = read_int("lc.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 1, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task PartitionManifest { + input { + File manifest + Int N + + RuntimeAttr? 
runtime_attr_override + } + + command <<< + set -euxo pipefail + + split -a 5 -d --additional-suffix=.txt -e -n l/~{N} ~{manifest} manifest_chunk_ + >>> + + output { + Array[File] manifest_chunks = glob("manifest_chunk_*.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 1, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + diff --git a/wdl/lib/Utility/PBUtils.wdl b/wdl/lib/Utility/PBUtils.wdl new file mode 100644 index 0000000..5dab3cc --- /dev/null +++ b/wdl/lib/Utility/PBUtils.wdl @@ -0,0 +1,1050 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task FindBams { + input { + String gcs_input_dir + + RuntimeAttr? 
runtime_attr_override + } + + String indir = sub(gcs_input_dir, "/$", "") + + command <<< + set -euxo pipefail + + gsutil ls "~{indir}/**subreads.bam" > subread_bams.txt + >>> + + output { + Array[String] subread_bams = read_lines("subread_bams.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 1, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task GetRunInfo { + input { + String bam + String SM + + RuntimeAttr? 
runtime_attr_override + } + + String gcs_dir = sub(bam, basename(bam), "") + + command <<< + set -x + + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + python /usr/local/bin/detect_run_info.py --SM ~{SM} ~{gcs_dir} > run_info.txt + + ((samtools view -H ~{bam} | grep '^@PG' | grep -w 'PN:ccs') 2>/dev/null) | \ + wc -l | \ + sed 's/0/false/' | \ + sed 's/1/true/' \ + > status.txt + >>> + + output { + Map[String, String] run_info = read_map("run_info.txt") + Boolean is_corrected = read_boolean("status.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 1, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.38" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ShardLongReads { + input { + File unaligned_bam + File unaligned_pbi + + Int num_shards = 300 + Int num_threads = 8 + + Boolean drop_per_base_N_pulse_tags = false + + String prefix = "shard" + + String zones = "us-central1-c us-central1-b" + Int? num_ssds + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + # when running large scale workflows, we sometimes see errors like the following + # A resource limit has delayed the operation: generic::resource_exhausted: allocating: selecting resources: selecting region and zone: + # no available zones: 2763 LOCAL_SSD_TOTAL_GB (738/30000 available) usage too high + zones: "select which zone (GCP) to run this task" + } + + Int disk_size = if defined(num_ssds) then 1 + 375*select_first([num_ssds]) else 1+3*ceil(size(unaligned_bam, "GB") + size(unaligned_pbi, "GB")) + Int mem = ceil(25*size(unaligned_pbi, "MB")/1000) + String ex = if drop_per_base_N_pulse_tags then "-x fi,fp,ri,rp" else "" + + command <<< + set -x + + samtools view -c ~{unaligned_bam} # to check if file is truncated + + python3 /usr/local/bin/shard_bam.py ~{ex} \ + -n ~{num_shards} \ + -t ~{num_threads} \ + -i ~{unaligned_pbi} \ + -p ~{prefix} \ + ~{unaligned_bam} + >>> + + output { + Array[File] unmapped_shards = glob("~{prefix}*.bam") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_threads, + mem_gb: mem, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.38" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + zones: zones + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task CCS { + input { + File subreads + + Boolean all = 
true # see https://ccs.how/faq/mode-all.html for details + Boolean kinetics = false # see https://ccs.how/faq/sqiie.html for details + Boolean by_strand = false + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size(subreads, "GB")) + String bn = basename(subreads, ".bam") + + command <<< + set -euxo pipefail + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + # Move the file from the UUID share to the current folder. + # This will remove the UUID from the file path and allow call caching to work. + infile=$( basename ~{subreads} ) + mv ~{subreads} $infile + + # Run CCS: + ccs ~{true='--all' false='' all} \ + ~{true='--all-kinetics --subread-fallback' false='' kinetics} \ + ~{true='--by-strand' false='' by_strand} \ + --num-threads $num_core \ + --log-file ~{bn}.ccs.log \ + --stderr-json-log \ + --suppress-reports \ + --report-file ~{bn}.ccs_reports.txt \ + --report-json ~{bn}.ccs_reports.json \ + --metrics-json ~{bn}.zmw_metrics.json.gz \ + --hifi-summary-json ~{bn}.hifi_summary.json \ + $infile ~{bn}.ccs_unmapped.bam + >>> + + output { + File consensus = "~{bn}.ccs_unmapped.bam" + File report = "~{bn}.ccs_reports.txt" + File report_json = "~{bn}.ccs_reports.json" + File metrics_json = "~{bn}.zmw_metrics.json.gz" + File hifi_summary_json = "~{bn}.hifi_summary.json" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 12, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.38" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + 
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ExtractHifiReads { + input { + File bam + + String sample_name + String library + + String prefix = "hifi" + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + sample_name: "we always rely explicitly on input SM name" + library: "this will override the LB: entry on the @RG line" + } + + Int disk_size = 1 + 3*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + extracthifi ~{bam} ~{prefix}.tmp.bam + + samtools view --no-PG -H ~{prefix}.tmp.bam > header.txt + awk '$1 ~ /^@RG/' header.txt > rg_line.txt + if ! grep -qF "LB:" rg_line.txt; then + sed -i "s/$/LB:tbd/" rg_line.txt + fi + if ! grep -qF "SM:" rg_line.txt; then + sed -i "s/$/SM:tbd/" rg_line.txt + fi + # fix LB: + awk -v lib="~{library}" -F '\t' 'BEGIN {OFS="\t"} { for (i=1; i<=NF; ++i) { if ($i ~ "LB:") $i="LB:"lib } print}' \ + rg_line.txt \ + > fixed_rg_line.lb.txt + # fix SM: + awk -v lib="~{sample_name}" -F '\t' 'BEGIN {OFS="\t"} { for (i=1; i<=NF; ++i) { if ($i ~ "SM:") $i="SM:"lib } print}' \ + fixed_rg_line.lb.txt \ + > fixed_rg_line.txt + sed -n '/@RG/q;p' header.txt > first_half.txt + sed -n '/@RG/,$p' header.txt | sed '1d' > second_half.txt + + cat first_half.txt fixed_rg_line.txt second_half.txt > fixed_header.txt + + date + samtools reheader fixed_header.txt ~{prefix}.tmp.bam > ~{prefix}.bam + date + >>> + + output { + File hifi_bam = "~{prefix}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.38" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, 
default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MergeCCSReports { + input { + Array[File] reports + String prefix = "out" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size(reports, "GB")) + + command <<< + set -euxo pipefail + + python /usr/local/bin/merge_ccs_reports.py ~{sep=' ' reports} > ~{prefix}.ccs_report.txt + >>> + + output { + File report = "~{prefix}.ccs_report.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.38" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ExtractUncorrectedReads { + input { + File subreads + File consensus + + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size(subreads, "GB") + size(consensus, "GB")) + + command <<< + set -euxo pipefail + + python3 /usr/local/bin/extract_uncorrected_reads.py -o ~{prefix}.bam ~{subreads} ~{consensus} + >>> + + output { + File uncorrected = "~{prefix}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.29" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Demultiplex { + input { + File bam + File barcode_file + String prefix = "demux" + Boolean ccs = false + Boolean isoseq = false + Boolean peek_guess = false + Boolean dump_removed = false + Boolean split_bam_named = false + Int peek = 0 + Int min_score = 0 + Int guess = 0 + Int guess_min_count = 0 + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + lima \ + ~{if ccs then "--ccs" else ""} \ + ~{if isoseq then "--isoseq" else ""} \ + ~{if peek_guess then "--peek-guess" else ""} \ + ~{if guess > 0 then "--guess ~{guess}" else ""} \ + ~{if guess_min_count > 0 then "--guess-min-count ~{guess_min_count}" else ""} \ + ~{if peek > 0 then "--peek ~{peek}" else ""} \ + ~{if dump_removed then "--dump-removed" else ""} \ + ~{if split_bam_named then "--split-bam-named" else ""} \ + ~{bam} \ + ~{barcode_file} \ + ~{prefix}.bam + + find . -type f -exec ls -lah {} \; + >>> + + output { + Array[File] demux_bams = glob("~{prefix}.*.bam") + File counts = "~{prefix}.lima.counts" + #File guess = "~{prefix}.lima.guess" + File report = "~{prefix}.lima.report" + File summary = "~{prefix}.lima.summary" + File? clips = "~{prefix}.lima.clips" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.29" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MakeDetailedDemultiplexingReport { + input { + File report + String type = "png" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size(report, "GB")) + + command <<< + set -euxo pipefail + + Rscript /lima_report_detail.R ~{report} ~{type} + >>> + + output { + Array[File] report_files = glob("detail_*~{type}") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 16, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MakeSummarizedDemultiplexingReport { + input { + File report + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size(report, "GB")) + + command <<< + set -euxo pipefail + + Rscript /lima_report_summary.R ~{report} + >>> + + output { + Array[File] report_files = glob("summary_*.png") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MakePerBarcodeDemultiplexingReports { + input { + File report + String type = "png" + + RuntimeAttr? 
runtime_attr_override
    }

    Int disk_size = 1 + 2*ceil(size(report, "GB"))

    command <<<
        # Deliberately no `-e`/`pipefail` here: lima_report_detail.R is expected to
        # fail for barcodes that are absent from the report, and the per-barcode
        # loop must keep going past those failures.
        set -x

        grep '>' /Sequel_96_barcodes_v2.fasta | sed 's/>//' | while read -r line ; do
            Rscript /lima_report_detail.R ~{report} ~{type} "$line"

            # Only rename the detail plots if this barcode actually produced output.
            if [ -f "detail_hq_length_hist_barcoded_or_not.~{type}" ]; then
                # Quote expansions so filenames survive word splitting / globbing.
                for f in detail_*; do mv "$f" "$line.$f"; done
            fi
        done
    >>>

    output {
        # One set of per-barcode plot files, prefixed with the barcode name.
        Array[File] report_files = glob("*.~{type}")
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores:          1,
        mem_gb:             4,
        disk_gb:            disk_size,
        boot_disk_gb:       10,
        preemptible_tries:  0,
        max_retries:        0,
        docker:             "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu:                    select_first([runtime_attr.cpu_cores,         default_attr.cpu_cores])
        memory:                 select_first([runtime_attr.mem_gb,            default_attr.mem_gb]) + " GiB"
        disks: "local-disk " +  select_first([runtime_attr.disk_gb,           default_attr.disk_gb]) + " HDD"
        bootDiskSizeGb:         select_first([runtime_attr.boot_disk_gb,      default_attr.boot_disk_gb])
        preemptible:            select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries:             select_first([runtime_attr.max_retries,       default_attr.max_retries])
        docker:                 select_first([runtime_attr.docker,            default_attr.docker])
    }
}

task RefineTranscriptReads {
    input {
        File bam
        File barcode_file
        String prefix = "flnc"
        Boolean require_polya = true

        RuntimeAttr?
runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + isoseq3 refine ~{bam} ~{barcode_file} ~{prefix}.bam ~{true='--require-polya' false='' require_polya} + >>> + + output { + File refined_bam = "~{prefix}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 8, + mem_gb: 16, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.29" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ClusterTranscripts { + input { + File bam + String prefix = "clustered" + Boolean use_qvs = true + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + isoseq3 cluster ~{bam} ~{prefix}.bam --verbose ~{true='--use-qvs' false='' use_qvs} + >>> + + output { + File clustered_bam = "~{prefix}.bam" + File clustered_pbi = "~{prefix}.bam.pbi" + File hq_fasta = "~{prefix}.hq.fasta.gz" + File hq_bam = "~{prefix}.hq.bam" + File hq_pbi = "~{prefix}.hq.bam.pbi" + File lq_fasta = "~{prefix}.lq.fasta.gz" + File lq_bam = "~{prefix}.lq.bam" + File lq_pbi = "~{prefix}.lq.bam.pbi" + File cluster = "~{prefix}.cluster" + File cluster_report_csv = "~{prefix}.cluster_report.csv" + File transcriptset_xml = "~{prefix}.transcriptset.xml" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 64, + mem_gb: 70, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.29" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task PolishTranscripts { + input { + File bam + File subreads_bam + File subreads_pbi + String prefix = "polished" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size([bam, subreads_bam], "GB")) + + command <<< + set -euxo pipefail + + isoseq3 polish ~{bam} ~{subreads_bam} ~{prefix}.bam + >>> + + output { + File polished_bam = "~{prefix}.bam" + File polished_fastq = "~{prefix}.hq.fastq.gz" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 24, + mem_gb: 64, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.29" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Align { + input { + File bam + File ref_fasta + + String sample_name + String? library + String map_preset + + Boolean drop_per_base_N_pulse_tags + + String prefix = "out" + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + sample_name: "we always rely explicitly on input SM name" + library: "this will override the LB: entry on the @RG line" + } + + String median_filter = if map_preset == "SUBREAD" then "--median-filter" else "" + String extra_options = if drop_per_base_N_pulse_tags then " --strip " else "" + + Boolean fix_library_entry = if defined(library) then true else false + + Int disk_size = 1 + 10*ceil(size(bam, "GB") + size(ref_fasta, "GB")) + Int cpus = 16 + Int mem = 96 + + command <<< + set -euxo pipefail + + pbmm2 align ~{bam} ~{ref_fasta} ~{prefix}.pre.bam \ + --preset ~{map_preset} \ + ~{median_filter} \ + --sample ~{sample_name} \ + ~{extra_options} \ + --sort \ + --unmapped + + if ~{fix_library_entry}; then + mv ~{prefix}.pre.bam ~{prefix}.pre.tmp.bam + samtools view --no-PG -H ~{prefix}.pre.tmp.bam > header.txt + awk '$1 ~ /^@RG/' header.txt > rg_line.txt + awk -v lib="~{library}" -F '\t' 'BEGIN {OFS="\t"} { for (i=1; i<=NF; ++i) { if ($i ~ "LB:") $i="LB:"lib } print}' \ + rg_line.txt \ + > fixed_rg_line.txt + sed -n '/@RG/q;p' header.txt > first_half.txt + sed -n '/@RG/,$p' header.txt | sed '1d' > second_half.txt + + cat first_half.txt fixed_rg_line.txt second_half.txt > fixed_header.txt + + date + samtools reheader fixed_header.txt ~{prefix}.pre.tmp.bam > ~{prefix}.pre.bam + rm ~{prefix}.pre.tmp.bam + date + fi + + samtools calmd -b --no-PG ~{prefix}.pre.bam ~{ref_fasta} > ~{prefix}.bam + samtools index ~{prefix}.bam + >>> + + output { + File aligned_bam = "~{prefix}.bam" + File aligned_bai = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: cpus, + mem_gb: mem, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.29" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + 
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task PBIndex { + input { + File bam + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + mv ~{bam} ~{basename(bam)} + + pbindex ~{basename(bam)} + >>> + + output { + File pbi = "~{basename(bam)}.pbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.29" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task CollapseTranscripts { + input { + File bam + String prefix = "out" + Boolean use_qvs = true + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + isoseq3 collapse ~{bam} ~{prefix}.gff + >>> + + output { + File gff = "~{prefix}.gff" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.29" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task SummarizeCCSReport { + input { + File report + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size(report, "GB")) + + command <<< + set -euxo pipefail + + cat ~{report} | grep 'ZMWs input' | awk -F": " '{ print $2 }' > zmws_input.txt + cat ~{report} | grep 'ZMWs pass filters' | awk -F": " '{ print $2 }' | awk '{ print $1 }' > zmws_pass_filters.txt + cat ~{report} | grep 'ZMWs fail filters' | awk -F": " '{ print $2 }' | awk '{ print $1 }' > zmws_fail_filters.txt + cat ~{report} | grep 'ZMWs shortcut filters' | awk -F": " '{ print $2 }' | awk '{ print $1 }' > zmws_shortcut_filters.txt + cat ~{report} | grep 'ZMWs pass filters' | awk -F": " '{ print $2 }' | awk '{ print $2 }' | sed 's/[()%]//g' > zmws_pass_filters_pct.txt + cat ~{report} | grep 'ZMWs fail filters' | awk -F": " '{ print $2 }' | awk '{ print $2 }' | sed 's/[()%]//g' > zmws_fail_filters_pct.txt + cat ~{report} | grep 'ZMWs shortcut filters' | awk -F": " '{ print $2 }' | awk '{ print $2 }' | sed 's/[()%]//g' > zmws_shortcut_filters_pct.txt + >>> + + output { + Float zmws_input = read_float("zmws_input.txt") + Float zmws_pass_filters = read_float("zmws_pass_filters.txt") + Float zmws_fail_filters = read_float("zmws_fail_filters.txt") + Float zmws_shortcut_filters = read_float("zmws_shortcut_filters.txt") + Float zmws_pass_filters_pct = read_float("zmws_pass_filters_pct.txt") + Float zmws_fail_filters_pct = read_float("zmws_fail_filters_pct.txt") + Float zmws_shortcut_filters_pct = read_float("zmws_shortcut_filters_pct.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + 
select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
        bootDiskSizeGb:         select_first([runtime_attr.boot_disk_gb,      default_attr.boot_disk_gb])
        preemptible:            select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries:             select_first([runtime_attr.max_retries,       default_attr.max_retries])
        docker:                 select_first([runtime_attr.docker,            default_attr.docker])
    }
}

# Extracts the TotalLength and NumRecords values from a PacBio dataset XML
# metadata file by stripping the surrounding pbds:* tags.
task SummarizeXMLMetadata {
    input {
        File xml

        RuntimeAttr? runtime_attr_override
    }

    Int disk_size = 1 + 2*ceil(size(xml, "GB"))

    command <<<
        set -euxo pipefail

        # The opening-tag patterns must be spelled out explicitly: an empty grep
        # pattern ('') matches every line and an empty sed pattern strips nothing,
        # so without them these outputs would not be parseable as floats.
        cat ~{xml} | grep '<pbds:TotalLength>' | sed 's/<pbds:TotalLength>//g' | sed 's/<\/pbds:TotalLength>//' | sed 's/\s*//g' > xml_total_length.txt
        cat ~{xml} | grep '<pbds:NumRecords>' | sed 's/<pbds:NumRecords>//g' | sed 's/<\/pbds:NumRecords>//' | sed 's/\s*//g' > xml_num_records.txt
    >>>

    output {
        Float xml_total_length = read_float("xml_total_length.txt")
        Float xml_num_records  = read_float("xml_num_records.txt")
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores:          1,
        mem_gb:             1,
        disk_gb:            disk_size,
        boot_disk_gb:       10,
        preemptible_tries:  2,
        max_retries:        1,
        docker:             "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu:                    select_first([runtime_attr.cpu_cores,         default_attr.cpu_cores])
        memory:                 select_first([runtime_attr.mem_gb,            default_attr.mem_gb]) + " GiB"
        disks: "local-disk " +  select_first([runtime_attr.disk_gb,           default_attr.disk_gb]) + " HDD"
        bootDiskSizeGb:         select_first([runtime_attr.boot_disk_gb,      default_attr.boot_disk_gb])
        preemptible:            select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries:             select_first([runtime_attr.max_retries,       default_attr.max_retries])
        docker:                 select_first([runtime_attr.docker,            default_attr.docker])
    }
}

task SummarizePBI {
    input {
        File pbi
        Int qual_threshold = 0

        RuntimeAttr?
runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size(pbi, "GB")) + + command <<< + set -euxo pipefail + + python3 /usr/local/bin/compute_pbi_stats.py -q ~{qual_threshold} ~{pbi} | tee map.txt + >>> + + output { + Map[String, Float] results = read_map("map.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 16, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.29" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/Utility/SoftClipper.wdl b/wdl/lib/Utility/SoftClipper.wdl new file mode 100644 index 0000000..6c4fa19 --- /dev/null +++ b/wdl/lib/Utility/SoftClipper.wdl @@ -0,0 +1,103 @@ +version 1.0 + +task SplitSoftClippedReads { + input { + File reads_fastq + File reference_fasta + Int rounds + Int clipping_threshold + } + + Int disk_size = (4 + rounds) * ceil(size(reads_fastq, "GB")) + String basename = basename(reads_fastq, ".fastq") + + command <<< + set -euxo pipefail + + for i in {1..~{rounds}} + do + if [[ $i -eq 1 ]]; + then + input_fn=~{reads_fastq} + else + input_fn=~{basename}_softclipped_x$((i - 1)).fastq + fi + + minimap2 --eqx -ax map-ont ~{reference_fasta} ${input_fn} \ + | python /soft_clipper.py --split-read-prefix=x${i} --clipping-threshold=~{clipping_threshold} \ + > 
~{basename}_softclipped_x${i}.fastq + + done + >>> + + output { + Array[File] split_reads = glob("*_softclipped_*.fastq") + File most_split_read = "~{basename}_softclipped_x~{rounds}.fastq" + } + + runtime { + cpu: 8 + memory: "32 GiB" + disks: "local-disk " + disk_size + " HDD" + bootDiskSizeGb: 10 + preemptible: 0 + maxRetries: 0 + docker: "quay.io/broad-long-read-pipelines/lr-softclipper:0.5.0" + } +} + +task SplitSoftClippedReadsAssisted { + input { + File reads_fastq + File reference_fasta + Int rounds + Int clipping_threshold + Int ref_conflict_threshold + File aid_reference_fasta + } + + Int disk_size = (14 + 6 + rounds) * ceil(size(reads_fastq, "GB")) + String basename = basename(reads_fastq, ".fastq") + + command <<< + set -euxo pipefail + + for i in {1..~{rounds}} + do + if [[ $i -eq 1 ]]; + then + input_fn=~{reads_fastq} + else + input_fn=~{basename}_softclipped_x$((i - 1)).fastq + fi + + minimap2 --eqx -ax map-ont ~{aid_reference_fasta} ${input_fn} > aid_ref.sam + minimap2 --eqx -ax map-ont ~{reference_fasta} ${input_fn} > ref.sam + + cat ref.sam | python /soft_clipper.py \ + --clipping-threshold=~{clipping_threshold} \ + --split-read-prefix=x${i} \ + --ref=aid_ref.sam \ + --ref-diff-threshold=~{ref_conflict_threshold} \ + --write-ref-conflicts-prefix=conflicts_x${i} \ + > ~{basename}_softclipped_x${i}.fastq + + done + >>> + + output { + Array[File] split_reads = glob("*_softclipped_*.fastq") + Array[File] conflicting_alignments = glob("conflicts_*.bam") + File most_split_read = "~{basename}_softclipped_x~{rounds}.fastq" + } + + runtime { + cpu: 8 + memory: "60 GiB" + disks: "local-disk " + disk_size + " HDD" + bootDiskSizeGb: 10 + preemptible: 0 + maxRetries: 0 + docker: "quay.io/broad-long-read-pipelines/lr-softclipper:0.5.0" + } +} diff --git a/wdl/lib/Utility/StringTie2.wdl b/wdl/lib/Utility/StringTie2.wdl new file mode 100644 index 0000000..7cddc3d --- /dev/null +++ b/wdl/lib/Utility/StringTie2.wdl @@ -0,0 +1,167 @@ +version 1.0 + +import 
"../../structs/Structs.wdl" + +task Quantify { + input { + File aligned_bam + File aligned_bai + File gtf + Boolean keep_retained_introns = false + String prefix = "out" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 10*ceil(size([aligned_bam, aligned_bai, gtf], "GB")) + + command <<< + set -euxo pipefail + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + stringtie \ + -Lv -p $num_core ~{true='-i' false='' keep_retained_introns} \ + -G ~{gtf} \ + -o ~{prefix}.gtf \ + -A ~{prefix}.gene_abund.out \ + ~{aligned_bam} + >>> + + output { + File st_gtf = "~{prefix}.gtf" + File st_abund = "~{prefix}.gene_abund.out" + } + + # TODO: Debug memory here. Is getting seg fault... More memory doesn't help. + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 64, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-stringtie2:2.2.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ExtractTranscriptSequences { + input { + File ref_fasta + File ref_fasta_fai + File gtf + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 2*ceil(size([ref_fasta, ref_fasta_fai, gtf], "GB")) + + command <<< + set -euxo pipefail + + gffread -w ~{prefix}.fa -g ~{ref_fasta} ~{gtf} + samtools faidx ~{prefix}.fa + samtools dict ~{prefix}.fa > ~{prefix}.dict + >>> + + output { + File transcripts_fa = "~{prefix}.fa" + File transcripts_fai = "~{prefix}.fa.fai" + File transcripts_dict = "~{prefix}.dict" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-stringtie2:2.2.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task CompareTranscriptomes { + input { + File guide_gtf + File new_gtf + String prefix = "out" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 2*ceil(size([guide_gtf, new_gtf], "GB")) + + # TODO: Look into the comment below and figure out why this is happening. + command <<< + set -euxo pipefail + + gffcompare -R -r ~{guide_gtf} -o ~{prefix} ~{new_gtf} + + dir=$(dirname ~{new_gtf}) + mv $dir/*.refmap ~{prefix}.refmap + mv $dir/*.tmap ~{prefix}.tmap + + if [ ! -e ~{prefix}.stats ] ; then + # Sometimes the stats file hasn't been named `.stats` and just has the prefix name. + # I have no idea why. It's very weird. 
But we don't use this for our output anyway, so + # for now we'll try to use the prefix file itself if it exists: + mv ~{prefix} ~{prefix}.stats + fi + + tree -h + >>> + + output { + File annotated_gtf = "~{prefix}.annotated.gtf" + File loci = "~{prefix}.loci" + File tracking = "~{prefix}.tracking" + File refmap = "~{prefix}.refmap" + File tmap = "~{prefix}.tmap" + File stats = "~{prefix}.stats" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-stringtie2:2.2.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} \ No newline at end of file diff --git a/wdl/lib/Utility/Utils.wdl b/wdl/lib/Utility/Utils.wdl new file mode 100644 index 0000000..9574735 --- /dev/null +++ b/wdl/lib/Utility/Utils.wdl @@ -0,0 +1,2390 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task GetDefaultDir { + input { + String workflow_name + + RuntimeAttr? 
runtime_attr_override + } + + command <<< + NAME=$(cat gcs_localization.sh | grep 'source bucket' | sed 's/# Localize files from source bucket //' | sed 's/ to container.*//' | sed "s/'//g") + + echo "gs://$NAME/results/~{workflow_name}" + >>> + + output { + String path = read_string(stdout()) + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task PrepareManifest { + input { + Array[String] files + + RuntimeAttr? 
runtime_attr_override + } + + command <<< + echo ~{sep=' ' files} | sed 's/ /\n/g' > manifest.txt + >>> + + output { + File manifest = "manifest.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task EchoManifest { + input { + File manifest + + RuntimeAttr? 
runtime_attr_override + } + + command <<< + set -euxo pipefail + + cat ~{manifest} + >>> + + output { + File out = stdout() + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ChunkManifest { + input { + File manifest + Int manifest_lines_per_chunk + + RuntimeAttr? 
runtime_attr_override + } + + command <<< + set -euxo pipefail + + split -a 5 -d --additional-suffix=".txt" -l ~{manifest_lines_per_chunk} ~{manifest} chunk_ + >>> + + output { + Array[File] manifest_chunks = glob("chunk_*") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task SortBam { + input { + File input_bam + String prefix = "sorted" + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + input_bam: "input BAM" + prefix: "[default-valued] prefix for output BAM" + } + + command <<< + set -euxo pipefail + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + samtools sort -@$num_core -o ~{prefix}.bam ~{input_bam} + samtools index ~{prefix}.bam + >>> + + output { + File sorted_bam = "~{prefix}.bam" + File sorted_bai = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 4, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +# TODO: investigate if samtools is better for this task +# Sort BAM file by coordinate order +# copied from "dsde_pipelines_tasks/BamProcessing.wdl", with +# customization on the runtime block, and "preemptible_tries" taken out +task SortSam { + input { + File input_bam + String output_bam_basename + Int compression_level + + RuntimeAttr? runtime_attr_override + } + + # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs + # more disk space. 
Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier + Float sort_sam_disk_multiplier = 3.25 + Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 + + command { + java -Dsamjdk.compression_level=~{compression_level} -Xms4000m -jar /usr/gitc/picard.jar \ + SortSam \ + INPUT=~{input_bam} \ + OUTPUT=~{output_bam_basename}.bam \ + SORT_ORDER="coordinate" \ + CREATE_INDEX=true \ + CREATE_MD5_FILE=true \ + MAX_RECORDS_IN_RAM=300000 \ + VALIDATION_STRINGENCY=SILENT + } + + output { + File output_bam = "~{output_bam_basename}.bam" + File output_bam_index = "~{output_bam_basename}.bai" + File output_bam_md5 = "~{output_bam_basename}.bam.md5" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 5, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MakeChrIntervalList { + input { + File ref_dict + Array[String] filter = ['random', 'chrUn', 'decoy', 'alt', 'HLA', 'EBV'] + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 10 + + command <<< + set -euxo pipefail + + grep '^@SQ' ~{ref_dict} | \ + awk '{ print $2 "\t" 1 "\t" $3 }' | \ + sed 's/[SL]N://g' | \ + grep -v -e '^@HD' ~{true='-e' false='' length(filter) > 0} ~{sep=" -e " filter} | \ + tee chrs.txt + >>> + + output { + Array[Array[String]] chrs = read_tsv("chrs.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FastaToSam { + input { + File fasta + + RuntimeAttr? 
runtime_attr_override + } + + Float fasta_sam_disk_multiplier = 3.25 + Int disk_size = ceil(fasta_sam_disk_multiplier * size(fasta, "GiB")) + 20 + + command <<< + python /usr/local/bin/prepare_run.py ~{fasta} + >>> + + output { + File output_bam = "unmapped.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-align:0.1.28" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task CountFastqRecords { + input { + File fastq + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + ceil(2 * size(fastq, "GiB")) + + command <<< + set -euxo pipefail + + FILE="~{fastq}" + if [[ "$FILE" =~ \.fastq$ ]] || [[ "$FILE" =~ \.fq$ ]]; then + cat ~{fastq} | awk '{s++}END{print s/4}' + elif [[ "$FILE" =~ \.fastq\.gz$ ]] || [[ "$FILE" =~ \.fq\.gz$ ]]; then + zcat ~{fastq} | awk '{s++}END{print s/4}' + fi + >>> + + output { + Int num_records = read_int(stdout()) + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-align:0.1.28" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task CountFastaRecords { + input { + File fasta + + RuntimeAttr?
runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size(fasta, "GiB")) + + command <<< + grep -c '>' ~{fasta} + + exit 0 + >>> + + output { + Int num_records = read_int(stdout()) + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-align:0.1.28" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task CountBamRecords { + input { + File bam + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { bam: { localization_optional: true } } + + Int disk_size = 100 + + command <<< + set -eux + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + samtools view -c ~{bam} > "count.txt" 2>"error.log" + if [[ -f "error.log" ]]; then + if [[ -s "error.log" ]]; then echo "samtools has warn/error" && cat "error.log" && exit 1; fi + fi + >>> + + output { + File? 
samools_error = "error.log" + Int num_records = read_int("count.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FilterListOfStrings { + meta { + description : "Filter a given list of files by a query term (essentially pipes the query into grep)." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + Array[String] list_to_filter + String query + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + list_to_filter: "Array of strings to filter by the query." + query: "Term to use to filter the given strings." 
+ } + + + command <<< + set -euxo pipefail + + \grep "~{query}" ~{write_lines(list_to_filter)} > filtered_list.txt + >>> + + output { + Array[String] filtered_list = read_lines("filtered_list.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "ubuntu:hirsute-20210825" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FilterReadsBySamFlags { + meta { + description : "Filter reads based on sam flags. Reads with ANY of the given flags will be removed from the given dataset." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File bam + String sam_flags + + String extra_args = "" + + String prefix = "filtered_reads" + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + bam: "BAM file to be filtered." + sam_flags: "Flags for which to remove reads. Reads with ANY of the given flags will be removed from the given dataset." + prefix : "[Optional] Prefix string to name the output file (Default: filtered_reads)." 
+ } + + Int disk_size = 20 + ceil(11 * size(bam, "GiB")) + + command <<< + + # Make sure we use all our processors: + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + + samtools view -h -b -F ~{sam_flags} -@$np ~{extra_args} ~{bam} > ~{prefix}.bam + samtools index -@$np ~{prefix}.bam + >>> + + output { + File output_bam = "~{prefix}.bam" + File output_bam_index = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-align:0.1.26" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task DownsampleSam { + meta { + description : "Downsample the given bam / sam file using Picard/GATK's DownsampleSam tool." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File bam + + Float probability = 0.01 + String strategy = "HighAccuracy" + String prefix = "downsampled_reads" + + Int random_seed = 1 + + String extra_args = "" + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + bam: "BAM file to be filtered." + probability : "[Optional] Probability that a read will be emitted (Default: 0.01)." + strategy : "[Optional] Strategy to use to downsample the given bam file (Default: HighAccuracy)." 
+ prefix : "[Optional] Prefix string to name the output file (Default: downsampled_reads)." + extra_args : "[Optional] Extra arguments to pass into DownsampleSam." + } + + Int disk_size = 20 + ceil(11 * size(bam, "GiB")) + + command <<< + + # Make sure we use all our processors: + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + + gatk DownsampleSam --VALIDATION_STRINGENCY SILENT --RANDOM_SEED ~{random_seed} -I ~{bam} -O ~{prefix}.bam -S ~{strategy} -P ~{probability} ~{extra_args} + samtools index -@$np ~{prefix}.bam + >>> + + output { + File output_bam = "~{prefix}.bam" + File output_bam_index = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.2.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task GrepCountBamRecords { + input { + File bam + String samfilter = "" + String regex + Boolean invert = false + String prefix = "sum" + + RuntimeAttr?
runtime_attr_override + } + + Int disk_size = 1 + ceil(2 * size(bam, "GiB")) + String arg = if invert then "-vc" else "-c" + + command <<< + set -euxo pipefail + + samtools view ~{samfilter} ~{bam} | grep ~{arg} ~{regex} > ~{prefix}.txt + >>> + + output { + Int num_records = read_int("~{prefix}.txt") + File num_records_file = "~{prefix}.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-align:0.1.28" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task GrepCountUniqueBamRecords { + input { + String bam + String samfilter = "" + String regex + Boolean invert = false + String prefix = "sum" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + ceil(2 * size(bam, "GiB")) + String arg = if invert then "-v" else "" + + command <<< + set -euxo pipefail + + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + samtools view ~{samfilter} ~{bam} | grep ~{arg} ~{regex} | sort | uniq | wc -l > ~{prefix}.txt + >>> + + output { + Int num_records = read_int("~{prefix}.txt") + File num_records_file = "~{prefix}.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-align:0.1.28" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Sum { + input { + Array[Int] ints + String prefix = "sum" + + RuntimeAttr?
runtime_attr_override + } + + command <<< + python -c "print(~{sep="+" ints})" > ~{prefix}.txt + >>> + + output { + Int sum = read_int("~{prefix}.txt") + File sum_file = "~{prefix}.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 1, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-align:0.1.28" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Uniq { + input { + Array[String] strings + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + + command <<< + set -euxo pipefail + + sort ~{write_lines(strings)} | uniq > uniq.txt + >>> + + output { + Array[String] unique_strings = read_lines("uniq.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Timestamp { + input { + Array[String] dummy_dependencies + + RuntimeAttr? 
runtime_attr_override + } + + command <<< + date --iso-8601=ns > timestamp.txt + >>> + + output { + String timestamp = read_string("timestamp.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 1, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task BamToTable { + input { + File bam + String prefix + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size(bam, "GB")) + + command <<< + samtools view ~{bam} | perl -n -e '($nm) = $_ =~ /NM:i:(\d+)/; ($as) = $_ =~ /AS:i:(\d+)/; ($za) = $_ =~ /ZA:Z:(\w+|\.)/; ($zu) = $_ =~ /ZU:Z:(\w+|\.)/; ($cr) = $_ =~ /CR:Z:(\w+|\.)/; ($cb) = $_ =~ /CB:Z:(\w+|\.)/; @a = split(/\s+/); print join("\t", $a[0], $a[1], $a[2], $a[3], $a[4], length($a[9]), $nm, $as, $za, $zu, $cr, $cb, $a[1], ($a[1] & 0x1 ? "paired" : "unpaired"), ($a[1] & 0x4 ? "unmapped" : "mapped"), ($a[1] & 0x10 ? "rev" : "fwd"), ($a[1] & 0x100 ? "secondary" : "primary"), ($a[1] & 0x800 ? "supplementary" : "non_supplementary")) . 
"\n"' | gzip > ~{prefix}.txt.gz + >>> + + output { + File table = "~{prefix}.txt.gz" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-align:0.1.28" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ConvertReads { + input { + File reads + String output_format + } + + Int disk_size = 3 * ceil(size(reads, "GB")) + + command <<< + set -euxo pipefail + + filename=~{reads} + input_filetype=${filename##*.} + output_filetype=~{output_format} + + if [[ ($input_filetype == "fastq" || $input_filetype == "fq") && $output_filetype == "fasta" ]]; then + echo "Converting $input_filetype to $output_filetype" + seqkit fq2fa $filename -o tmp.out + elif [ $input_filetype == $output_filetype ]; then + echo "Input filetype is the output filetype" + mv $filename tmp.out + else + echo "ConvertReads does not know how to convert $input_filetype to $output_filetype" + exit 1 + fi + + mv tmp.out converted_reads.$output_filetype + >>> + + output { + File converted_reads = "converted_reads.~{output_format}" + } + + runtime { + cpu: 4 + memory: "8 GiB" + disks: "local-disk " + disk_size + " HDD" + bootDiskSizeGb: 10 + preemptible: 2 + maxRetries: 0 + docker: 
"quay.io/broad-long-read-pipelines/lr-pacasus:0.3.0" + } +} + +task BamToBed { + input { + File bam + String prefix + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 2*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + bedtools genomecov -ibam ~{bam} -bg > ~{prefix}.bed + >>> + + output { + File bed = "~{prefix}.bed" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task BamToFastq { + input { + File bam + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 3*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + samtools fastq ~{bam} | gzip > ~{prefix}.fq.gz + >>> + + output { + File reads_fq = "~{prefix}.fq.gz" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MergeFastqs { + input { + Array[File] fastqs + + String prefix = "merged" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 3 * ceil(size(fastqs, "GB")) + + String disk_type = if disk_size < 375 then "LOCAL" else "HDD" + + Int memory = 8 + + command <<< + FILE="~{fastqs[0]}" + if [[ "$FILE" =~ \.gz$ ]]; then + cat ~{sep=' ' fastqs} > ~{prefix}.fq.gz + else + cat ~{sep=' ' fastqs} | gzip > ~{prefix}.fq.gz + fi + >>> + + output { + File merged_fastq = "~{prefix}.fq.gz" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: memory, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " ~{disk_type}" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +# A utility to merge several input BAMs into a single BAM. +task MergeBams { + input { + Array[File] bams + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + bams: "input array of BAMs to be merged" + prefix: "[default-valued] prefix for output BAM" + } + + Int disk_size = 1 + 4*ceil(size(bams, "GB")) + + command <<< + set -euxo pipefail + + samtools merge \ + -p -c --no-PG \ + -@ 2 \ + --write-index \ + -o "~{prefix}.bam##idx##~{prefix}.bam.bai" \ + ~{sep=" " bams} + >>> + + output { + File merged_bam = "~{prefix}.bam" + File merged_bai = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 20, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Index { + input { + File bam + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + bam: "BAM file to be indexed" + } + + Int disk_size = 1 + 2*ceil(size(bam, "GB")) + + String prefix = basename(bam) + + command <<< + set -euxo pipefail + + mv ~{bam} ~{prefix} + samtools index ~{basename(prefix)} + >>> + + output { + File bai = "~{prefix}.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FilterReadsWithTagValues { + input { + File bam + String tag + String value_to_remove + String prefix = "out" + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + bam: "Input BAM file from which to remove a tag with certain values." + tag: "Name of the tag to target for potential removal." + value_to_remove: "Tag value to use to remove reads. Reads will be removed if they have the given tag with this value." 
+ prefix: "[default-valued] prefix for output BAM" + } + + Int disk_size = 20 + 11*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + java -jar /usr/picard/picard.jar \ + FilterSamReads \ + --VALIDATION_STRINGENCY SILENT \ + --FILTER excludeTagValues \ + --TAG ~{tag} \ + --TAG_VALUE ~{value_to_remove} \ + -I ~{bam} \ + -O ~{prefix}.bam + >>> + + output { + File output_bam = "~{prefix}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 20, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "broadinstitute/picard:2.23.7" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +# A utility to subset a BAM to specifed loci +task SubsetBam { + input { + File bam + File bai + String locus + String prefix = "subset" + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + bam: { + description: "bam to subset", + localization_optional: true + } + bai: "index for bam file" + locus: "genomic locus to select" + prefix: "prefix for output bam and bai file names" + } + + Int disk_size = 4*ceil(size([bam, bai], "GB")) + + command <<< + set -euxo pipefail + + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + + samtools view -bhX ~{bam} ~{bai} ~{locus} > ~{prefix}.bam + samtools index ~{prefix}.bam + >>> + + output { + File subset_bam = "~{prefix}.bam" + File subset_bai = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 10, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.9" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +# A utility to subset a BAM to specifed loci +task ExcludeRegionsFromBam { + input { + File bam + File bai + Array[String] loci + String prefix = "subset" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 4*ceil(size([bam, bai], "GB")) + + command <<< + set -euxo pipefail + + echo ~{sep=',' loci} | sed 's/,/\n/g' | sed 's/[:-]/\t/g' > regions.bed + samtools view -L regions.bed -hbU ~{prefix}.bam -o /dev/null ~{bam} + samtools index ~{prefix}.bam + >>> + + output { + File subset_bam = "~{prefix}.bam" + File subset_bai = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.9" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +# A utility to select the first N reads from a BAM file +task SelectFirstNReads { + input { + File bam + Int n + String prefix = "selected" + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + bam: { + description: "bam to subset", + localization_optional: true + } + n: "number of reads to select" + prefix: "prefix for output bam file" + } + + Int disk_size = ceil(size(bam, "GB")) + + command <<< + set -x + + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + + ((samtools view -H ~{bam}) && (samtools view ~{bam} | head -n ~{n})) | samtools view -b > ~{prefix}.bam + >>> + + output { + File selected_bam = "~{prefix}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 10, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.9" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ResilientSubsetBam { + + meta { + description: "For subsetting a high-coverage BAM stored in GCS, without localizing (more resilient to auth. expiration)." + } + + input { + File bam + File bai + + File interval_list_file + String interval_id + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + bam: { + localization_optional: true + } + interval_list_file: "a Picard-style interval list file to subset reads with" + interval_id: "an ID string for representing the intervals in the interval list file" + prefix: "prefix for output bam and bai file names" + } + + Array[String] intervals = read_lines(interval_list_file) + + Int disk_size = 4*ceil(size([bam, bai], "GB")) + + String subset_prefix = prefix + "." + interval_id + + command <<< + + # the way this works is the following: + # 0) relying on the re-auth.sh script to export the credentials + # 1) perform the remote sam-view subsetting in the background + # 2) listen to the PID of the background process, while re-auth every 1200 seconds + source /opt/re-auth.sh + set -euxo pipefail + + # see man page for what '-M' means + samtools view \ + -bhX \ + -M \ + -@ 1 \ + --verbosity=8 \ + --write-index \ + -o "~{subset_prefix}.bam##idx##~{subset_prefix}.bam.bai" \ + ~{bam} ~{bai} \ + ~{sep=" " intervals} && exit 0 || { echo "samtools seem to have failed"; exit 77; } & + pid=$! + + set +e + count=0 + while true; do + sleep 1200 && date && source /opt/re-auth.sh + count=$(( count+1 )) + if [[ ${count} -gt 6 ]]; then exit 0; fi + if ! 
pgrep -x -P $pid; then exit 0; fi + done + >>> + + output { + File subset_bam = "~{subset_prefix}.bam" + File subset_bai = "~{subset_prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 10, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task SplitBam { + input { + File bam + File bai + Array[String] filter = ['random', 'chrUn', 'decoy', 'alt', 'HLA'] + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + bam: "bam to split" + bai: "index for bam file" + filter: "contigs to ignore" + } + + Int disk_size = 4*ceil(size([bam, bai], "GB")) + + command <<< + set -euxo pipefail + + samtools view -H ~{bam} | \ + grep '^@SQ' | \ + grep -v -e '^@HD' ~{true='-e' false='' length(filter) > 0} ~{sep=" -e " filter} | \ + awk '{ print $2 }' | \ + sed 's/SN://' | + parallel -j+0 "samtools view -bh -o {}.bam ~{bam} {} && samtools index {}.bam" + >>> + + output { + Array[File] subset_bams = glob("*.bam") + Array[File] subset_bais = glob("*.bai") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 8, + mem_gb: 10, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Bamtools { + input { + File bamfile + String cmd + String args + + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size_gb = 1 + ceil(2 * size(bamfile, "GiB")) + + command <<< + bamtools ~{cmd} -in ~{bamfile} -out ~{prefix}.bam ~{args} + >>> + + output { + File bam = "~{prefix}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 2, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.9.beta" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + + +task FilterBamOnTag { + input { + File bam + String prefix = "out" + String tag + String expression + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + bam: "input BAM file" + prefix: "[default-valued] prefix for output BAM" + } + + Int disk_size = 4*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + bamtools filter -in ~{bam} -out ~{prefix}.bam -tag "~{tag}":"~{expression}" + samtools index ~{prefix}.bam + >>> + + output { + File filtered_bam = "~{prefix}.bam" + File filtered_bai = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.9" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task DeduplicateBam { + meta { + description: "Utility to drop (occationally happening) duplicate records in input BAM" + } + + input { + File aligned_bam + File aligned_bai + + Boolean same_name_as_input = true + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 3 * ceil(size(aligned_bam, "GB")) + + String base = basename(aligned_bam, ".bam") + String prefix = if (same_name_as_input) then base else (base + ".dedup") + + command <<< + echo "===========================================================" + echo "collecting duplicate information" + time \ + samtools view -@ 1 "~{aligned_bam}" | \ + awk -F "\t" 'BEGIN {OFS="\t"} {print $1, $2, $3, $4, $5}' | \ + sort | uniq -d \ + > "~{aligned_bam}".duplicates.txt + echo "===========================================================" + echo "de-duplicating" + time python3 /opt/remove_duplicate_ont_aln.py \ + --prefix "~{prefix}" \ + --annotations "~{aligned_bam}".duplicates.txt \ + "~{aligned_bam}" + echo "===========================================================" + echo "DONE" + samtools index "~{prefix}.bam" + >>> + + output { + File corrected_bam = "~{prefix}.bam" + File corrected_bai = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 16, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.10" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Cat { + input { + Array[File] files + Boolean has_header = false + String out = "out.txt" + + RuntimeAttr? 
runtime_attr_override
+    }
+
+    parameter_meta {
+        files: "text files to combine"
+        has_header: "files have a redundant header"
+        out: "[default-valued] output filename"
+    }
+
+    Int disk_size = 4*ceil(size(files, "GB"))
+
+    command <<<
+        set -euxo pipefail
+
+        HAS_HEADER=~{true='1' false='0' has_header}
+
+        # Fix: original test was '[ HAS_HEADER == 1 ]' (missing '$'), which compares
+        # the literal string "HAS_HEADER" to "1" and is always false, so the
+        # header-deduplicating branch never ran even when has_header was true.
+        if [ ${HAS_HEADER} -eq 1 ]; then
+            ((head -1 ~{files[0]}) && (cat ~{sep=' ' files} | xargs -n 1 tail -n +2)) > ~{out}
+        else
+            cat ~{sep=' ' files} > ~{out}
+        fi
+    >>>
+
+    output {
+        File combined = out
+    }
+
+    #########################
+    RuntimeAttr default_attr = object {
+        cpu_cores:          1,
+        mem_gb:             4,
+        disk_gb:            disk_size,
+        boot_disk_gb:       10,
+        preemptible_tries:  3,
+        max_retries:        2,
+        docker:             "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.9"
+    }
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+    runtime {
+        cpu:                    select_first([runtime_attr.cpu_cores,         default_attr.cpu_cores])
+        memory:                 select_first([runtime_attr.mem_gb,            default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " +  select_first([runtime_attr.disk_gb,           default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb:         select_first([runtime_attr.boot_disk_gb,      default_attr.boot_disk_gb])
+        preemptible:            select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries:             select_first([runtime_attr.max_retries,       default_attr.max_retries])
+        docker:                 select_first([runtime_attr.docker,            default_attr.docker])
+    }
+}
+
+task ListBamContigs {
+    input {
+        String bam
+
+        RuntimeAttr?
runtime_attr_override + } + + parameter_meta { + bam: "input BAM from which available contigs should be listed" + } + + Int disk_size = 1 + + command <<< + set -euxo pipefail + + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + samtools view -H ~{bam} | grep '^@SQ' | awk '{ print $2 }' | sed 's/SN://' > chrs.txt + >>> + + output { + Array[String] contigs = read_lines("chrs.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ComputeGenomeLength { + input { + File fasta + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + fasta: "FASTA file" + } + + Int disk_size = 2*ceil(size(fasta, "GB")) + + command <<< + set -euxo pipefail + + samtools dict ~{fasta} | \ + grep '^@SQ' | \ + awk '{ print $3 }' | \ + sed 's/LN://' | \ + awk '{ sum += $1 } END { print sum }' > length.txt + >>> + + output { + Float length = read_float("length.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ListFilesOfType { + input { + String gcs_dir + Array[String] suffixes + Boolean recurse = false + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + gcs_dir: "input directory" + suffixes: "suffix(es) for files" + recurse: "if true, recurse through subdirectories" + } + + Int disk_size = 1 + String in_dir = sub(gcs_dir, "/$", "") + + command <<< + set -ex + gsutil ls ~{true='-r' false='' recurse} ~{in_dir} > temp.txt + grep -E '(~{sep="|" suffixes})$' temp.txt > files.txt || touch files.txt + if [ ! 
-s files.txt ]; then echo "None found" && exit 1; fi + >>> + + output { + Array[String] files = read_lines("files.txt") + File manifest = "files.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task StopWorkflow { + input { + String reason + } + command <<< + echo -e "Workflow explicitly stopped because \n ~{reason}." && exit 1 + >>> + runtime {docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest"} +} + +task InferSampleName { + meta { + description: "Infer sample name encoded on the @RG line of the header section. Fails if multiple values found, or if SM ~= unnamedsample." + } + + input { + File bam + File bai + } + + parameter_meta { + bam: { + localization_optional: true + } + } + + command <<< + set -euxo pipefail + + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + samtools view -H ~{bam} > header.txt + if ! grep -q '^@RG' header.txt; then echo "No read group line found!" 
&& exit 1; fi
+
+        grep '^@RG' header.txt | sed 's/\t/\n/g' | grep '^SM:' | sed 's/SM://g' | sort | uniq > sample.names.txt
+        # Fix: 'wc -l sample.names.txt' prints "N sample.names.txt"; that string is not
+        # a valid arithmetic operand for [[ ... -gt 1 ]], so under 'set -euxo pipefail'
+        # the task failed unconditionally. Reading from stdin yields the bare count.
+        if [[ $(wc -l < sample.names.txt) -gt 1 ]]; then echo "Multiple sample names found!" && exit 1; fi
+        if grep -iq "unnamedsample" sample.names.txt; then echo "Sample name found to be unnamedsample!" && exit 1; fi
+    >>>
+
+    output {
+        String sample_name = read_string("sample.names.txt")
+    }
+
+    runtime {
+        cpu: 1
+        memory: "4 GiB"
+        disks: "local-disk 100 HDD"
+        bootDiskSizeGb: 10
+        preemptible: 2
+        maxRetries: 1
+        docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1"
+    }
+}
+
+task CheckOnSamplenames {
+    meta {
+        description: "Makes sure the provided sample names are all same, i.e. no mixture of sample names"
+    }
+
+    input {
+        Array[String] sample_names
+    }
+
+    command <<<
+        set -eux
+        n_sm=$(sort ~{write_lines(sample_names)} | uniq | wc -l | awk '{print $1}')
+        if [[ ${n_sm} -gt 1 ]]; then echo "Sample mixture!" && exit 1; fi
+    >>>
+
+    runtime {
+        cpu: 1
+        memory: "4 GiB"
+        disks: "local-disk 100 HDD"
+        bootDiskSizeGb: 10
+        preemptible: 2
+        maxRetries: 1
+        docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest"
+    }
+}
+
+task FixSampleName {
+    meta {
+        description: "This fixes the sample name of a demultiplexed BAM"
+    }
+
+    input {
+        File bam
+        String sample_name
+
+        RuntimeAttr? runtime_attr_override
+    }
+
+    String prefix = basename(bam, ".bam")
+    Int disk_size = 3*ceil(size(bam, "GB"))
+
+    command <<<
+        set -euxo pipefail
+
+        samtools view --no-PG -H ~{bam} > header.txt
+        awk '$1 ~ /^@RG/' header.txt > rg_line.txt
+        if !
grep -qF "SM:" rg_line.txt; then + sed -i "s/$/SM:tbd/" rg_line.txt + fi + awk -v lib="~{sample_name}" -F '\t' 'BEGIN {OFS="\t"} { for (i=1; i<=NF; ++i) { if ($i ~ "SM:") $i="SM:"lib } print}' \ + rg_line.txt \ + > fixed_rg_line.txt + + sed -n '/@RG/q;p' header.txt > first_half.txt + sed -n '/@RG/,$p' header.txt | sed '1d' > second_half.txt + + cat first_half.txt fixed_rg_line.txt second_half.txt > fixed_header.txt + cat fixed_header.txt + + mv ~{bam} old.bam + date + samtools reheader fixed_header.txt old.bam > ~{prefix}.bam + date + >>> + + output { + File reheadered_bam = "~{prefix}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +# todo: hook this in to all tasks using LOCAL ssds +task ComputeAllowedLocalSSD { + # This exists because of the following error message + # Task PBFlowcell.ShardLongReads:NA:1 failed. The job was stopped before the command finished. PAPI error code 3. + # Execution failed: allocating: creating instance: inserting instance: Number of local SSDs for an instance of type custom-8-15360 + # should be one of [0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 24], while [9] is requested. 
+ meta { + description: "Compute the number of LOCAL ssd's allowed by Google" + } + input { + Int intended_gb + } + Int raw = intended_gb / 375 + command <<< + if [[ ~{raw} -lt 1 ]]; then ## we are pushing the boundary here a bit, based on the assumption that input is a convervative estimate + echo "1" > "result.txt" + elif [[ ~{raw} -lt 9 ]]; then + echo ~{raw} > "result.txt" + elif [[ ~{raw} -lt 16 ]]; then + echo "16" > "result.txt" + elif [[ ~{raw} -lt 24 ]]; then + echo "24" > "result.txt" + else + echo "Would request ~{raw} local SSDs, more than possible (24)." && exit 1 + fi + >>> + + output { + Int numb_of_local_ssd = read_int("result.txt") + } + + runtime { + cpu: 1 + memory: "4 GiB" + disks: "local-disk 100 HDD" + bootDiskSizeGb: 10 + preemptible: 2 + maxRetries: 1 + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +task RandomZoneSpewer { + input { + Int num_of_zones + } + + command <<< + set -eux + + # by no means a perfect solution, but that's not desired anyway + all_known_zones=("us-central1-a" "us-central1-b" "us-central1-c" "us-central1-f" "us-east1-b" "us-east1-c" "us-east1-d" "us-east4-a" "us-east4-b" "us-east4-c" "us-west1-a" "us-west1-b" "us-west1-c" "us-west2-a" "us-west2-b" "us-west2-c" "us-west3-a" "us-west3-b" "us-west3-c" "us-west4-a" "us-west4-b" "us-west4-c") + for zone in "${all_known_zones[@]}"; do echo "${zone}" >> zones.txt; done + + shuf zones.txt | head -n ~{num_of_zones} | tr '\n' ' ' > "result.txt" + >>> + + output { + String zones = read_string("result.txt") + } + + runtime { + cpu: 1 + memory: "4 GiB" + disks: "local-disk 100 HDD" + bootDiskSizeGb: 10 + preemptible: 2 + maxRetries: 1 + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +task ShardReads { + input { + File bam + File bam_index + + String prefix = "shard" + + Int num_shards = 10 + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + ceil(4 * size(bam, "GiB")) + + String sharded_bam_folder = "sharded_bams" + + command <<< + num_reads=$(samtools idxstats ~{bam} | awk 'BEGIN{s=0}{s+=$3;s+=$4}END{print s}') + + mkdir sharded_bams + + java -jar /usr/picard/picard.jar \ + SplitSamByNumberOfReads \ + --VALIDATION_STRINGENCY SILENT \ + -I ~{bam} \ + -O ~{sharded_bam_folder} \ + -OUT_PREFIX ~{prefix} \ + -N_FILES ~{num_shards} \ + -TOTAL_READS ${num_reads} + >>> + + output { + Array[File] shards = glob("~{sharded_bam_folder}/*.bam") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 20, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.9.gamma" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +# Get the current timestamp as a string. +# Levergaes the unix `date` command. +# You can enter your own date format string. +# The default date string is: +# %Y%m%d_%H%M%S_%N +# which corresponds to a date of the following form: +# For August 10th, 2020 at 16:06:32.7091 EDT (20:06:32.7091 UTC): +# 20200810_200632_709100000 +# +task GetCurrentTimestampString { + + meta { + # The volatile keyword forces call caching to be turned off, which is + # exactly what we want for this task. 
+ # For more info see: https://cromwell.readthedocs.io/en/stable/optimizations/VolatileTasks/ + volatile: true + } + + input { + String date_format = "%Y%m%d_%H%M%S_%N" + } + + String date_file = "the_date_file.txt" + + command { + date +~{date_format} > ~{date_file} + cat ~{date_file} + } + + # ------------------------------------------------ + # Runtime settings: + runtime { + docker: "ubuntu:19.10" + memory: "512 MB" + disks: "local-disk 10 HDD" + bootDiskSizeGb: "15" + preemptible: 0 + cpu: 1 + } + + output { + String timestamp_string = read_string(date_file) + } +} + + +task GetRawReadGroup { + input { + String gcs_bam_path + + RuntimeAttr? runtime_attr_override + } + + String out_file = "raw_read_group.txt" + + command { + set -x + + export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` + + # We need to escape the tabs and convert the spaces so that the read group will play nice with downstream processing: + samtools view -H ~{gcs_bam_path} | grep -m1 '^@RG' | sed -e 's@\t@\\t@g' -e 's@ @_@g' > ~{out_file} + + echo "Raw Read Group:" + cat ~{out_file} + } + + output { + String rg = read_string(out_file) + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.30" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: 
select_first([runtime_attr.docker, default_attr.docker])
+    }
+}
+
+task FailWithWarning {
+    input {
+        String warning
+    }
+    command <<<
+        set -e
+
+        echo "~{warning}"
+        echo "~{warning}" 1>&2
+        false
+    >>>
+    #########################
+    RuntimeAttr default_attr = object {
+        cpu_cores:          1,
+        mem_gb:             2,
+        disk_gb:            10,
+        boot_disk_gb:       10,
+        preemptible_tries:  2,
+        max_retries:        2,
+        docker:             "us.gcr.io/broad-dsp-lrma/lr-finalize:0.1.2"
+    }
+    runtime {
+        cpu: default_attr.cpu_cores
+        memory: select_first([default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: default_attr.boot_disk_gb
+        preemptible: default_attr.preemptible_tries
+        maxRetries: default_attr.max_retries
+        docker: default_attr.docker
+    }
+}
+
+task SplitDelimitedString {
+    input {
+        String s
+        String separate
+    }
+
+    command <<<
+        set -eux
+
+        # Fix: quote the interpolated values so a delimiter such as ';' or '*' (or
+        # whitespace inside ~{s}) is not word-split or glob-expanded by the shell
+        # before reaching echo/tr.
+        echo "~{s}" | tr "~{separate}" '\n' > result.txt
+    >>>
+
+    output {
+        Array[String] arr = read_lines("result.txt")
+    }
+
+    runtime {
+        disks: "local-disk 100 HDD"
+        docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest"
+    }
+}
+
+task ConstructMap {
+    meta {
+        description:
+        "Use only when the keys are guaranteed to be unique and the two arrays are of the same length."
+ } + input { + Array[String] keys + Array[String] values + } + command <<< + set -eux + paste ~{write_lines(keys)} ~{write_lines(values)} > converted.tsv + cat converted.tsv + >>> + + output { + Map[String, String] converted = read_map("converted.tsv") + } + + runtime { + disks: "local-disk 100 HDD" + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +task MapToTsv { + input { + Map[String, Float] my_map + String name_of_file + } + + command <<< + cp ~{write_map(my_map)} ~{name_of_file} + >>> + + output { + File result = "~{name_of_file}" + } + + runtime { + disks: "local-disk 100 HDD" + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + } +} diff --git a/wdl/lib/Utility/VariantUtils.wdl b/wdl/lib/Utility/VariantUtils.wdl new file mode 100644 index 0000000..dd3f702 --- /dev/null +++ b/wdl/lib/Utility/VariantUtils.wdl @@ -0,0 +1,628 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task MergePerChrCalls { + input { + Array[File] vcfs + File ref_dict + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 2*ceil(size(vcfs, "GB")) + 1 + + command <<< + set -euxo pipefail + + VCF_WITH_HEADER=~{vcfs[0]} + GREPCMD="grep" + if [[ ~{vcfs[0]} =~ \.gz$ ]]; then + GREPCMD="zgrep" + fi + + $GREPCMD '^#' $VCF_WITH_HEADER | grep -v -e '^##contig' -e CHROM > header + grep '^@SQ' ~{ref_dict} | awk '{ print "##contig=" }' | sed 's/[SL]N://g' >> header + $GREPCMD -m1 CHROM $VCF_WITH_HEADER >> header + + ((cat header) && ($GREPCMD -h -v '^#' ~{sep=' ' vcfs})) | bcftools sort | bgzip > ~{prefix}.vcf.gz + tabix -p vcf ~{prefix}.vcf.gz + >>> + + output { + File vcf = "~{prefix}.vcf.gz" + File tbi = "~{prefix}.vcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 24, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:latest" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MergeAndSortVCFs { + meta { + description: "Fast merging & sorting VCFs when the default sorting is expected to be slow" + } + + input { + Array[File] vcfs + + File ref_fasta_fai + File? header_definitions_file + + String prefix + + RuntimeAttr? 
runtime_attr_override
+    }
+    parameter_meta {
+        header_definitions_file: "a union of definition header lines for input VCFs (related to https://github.com/samtools/bcftools/issues/1629)"
+    }
+
+    Int sz = ceil(size(vcfs, 'GB'))
+    Int disk_sz = if sz > 100 then 5 * sz else 375  # it's rare to see such large gVCFs, for now
+
+    Boolean suspected_incomplete_definitions = defined(header_definitions_file)
+
+    Int cores = 8
+
+    # pending a bug fix (bcftools github issue 1576) in official bcftools release,
+    # bcftools sort can be more efficient in using memory
+    Int machine_memory = 48 # 96
+    Int work_memory = ceil(machine_memory * 0.8)
+
+    command <<<
+        set -euxo pipefail
+
+        echo ~{sep=' ' vcfs} | sed 's/ /\n/g' > all_raw_vcfs.txt
+
+        echo "==========================================================="
+        echo "starting concatenation" && date
+        echo "==========================================================="
+        bcftools \
+            concat \
+            --naive \
+            --threads ~{cores-1} \
+            -f all_raw_vcfs.txt \
+            --output-type v \
+            -o concatedated_raw.vcf.gz  # fast, at the expense of disk space
+        for vcf in ~{sep=' ' vcfs}; do rm $vcf ; done
+
+        # this is another bug in bcftools that's hot fixed but not in official release yet
+        # (see bcftools github issue 1591)
+        echo "==========================================================="
+        echo "done concatenation, fixing header of naively concatenated VCF" && date
+        echo "==========================================================="
+        if ~{suspected_incomplete_definitions}; then
+            # a bug from bcftools concat --naive https://github.com/samtools/bcftools/issues/1629
+            set +e
+            zgrep "^##" concatedated_raw.vcf.gz > header.txt
+            grep -vF 'fileformat' header.txt \
+                | grep -vF 'fileDate=' \
+                | grep -vF 'source=' \
+                | grep -vF 'contig' \
+                | grep -vF 'ALT' \
+                | grep -vF 'FILTER' \
+                | grep -vF 'INFO' \
+                | grep -vF 'FORMAT' \
+                > tmp.others.txt
+            # Fix: was 'touch tmp.other.txt' (typo) — the guard must ensure
+            # tmp.others.txt exists for the subsequent cat of tmp.*.txt files.
+            touch tmp.others.txt
+            set -e
+            zgrep "^#CHROM" concatedated_raw.vcf.gz > tmp.sampleline.txt
+            cat \
~{header_definitions_file} \ + tmp.others.txt \ + tmp.sampleline.txt \ + > fixed.header.txt + rm -f tmp.*.txt && cat fixed.header.txt + + bcftools reheader \ + -h fixed.header.txt \ + -o tmp.wgs.vcf.gz \ + concatedated_raw.vcf.gz + rm concatedated_raw.vcf.gz + else + mv concatedated_raw.vcf.gz tmp.wgs.vcf.gz + fi + bcftools reheader \ + --fai ~{ref_fasta_fai} \ + -o wgs_raw.vcf.gz \ + tmp.wgs.vcf.gz + rm tmp.wgs.vcf.gz + + echo "===========================================================" + echo "starting sort operation" && date + echo "===========================================================" + bcftools \ + sort \ + --temp-dir tm_sort \ + --output-type z \ + -o ~{prefix}.vcf.gz \ + wgs_raw.vcf.gz + bcftools index --tbi --force ~{prefix}.vcf.gz + echo "===========================================================" + echo "done sorting" && date + echo "===========================================================" + >>> + + output { + File vcf = "~{prefix}.vcf.gz" + File tbi = "~{prefix}.vcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: cores, + mem_gb: "~{machine_memory}", + disk_gb: disk_sz, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task CollectDefinitions { + meta { + description: 
"Collect (union) various definitions in vcf files, adddressing a bcftols bug: https://github.com/samtools/bcftools/issues/1629" + } + input { + Array[File] vcfs + + RuntimeAttr? runtime_attr_override + } + + Int sz = ceil(size(vcfs, 'GB')) + Int disk_sz = if sz > 100 then 5 * sz else 375 + + command <<< + set -euxo pipefail + + zgrep "^##" ~{vcfs[0]} > header.txt + grep -F '##fileformat' header.txt > tmp.0.txt + grep -F '##fileDate=' header.txt > tmp.1.txt + if grep -q -F '##source=' header.txt; then grep -F 'source=' header.txt > tmp.2.txt; fi + touch tmp.2.txt + grep -F '##contig=' header.txt > tmp.3.txt + + cat tmp*txt > easy.txt && rm tmp*txt + + touch tmp.alt.txt tmp.ft.txt tmp.info.txt tmp.format.txt + for vcf in ~{sep=' ' vcfs}; do + zgrep -F '##ALT=' "${vcf}" >> tmp.alt.txt + zgrep -F '##FILTER=' "${vcf}" >> tmp.ft.txt + zgrep -F '##INFO=' "${vcf}" >> tmp.info.txt + zgrep -F '##FORMAT=' "${vcf}" >> tmp.format.txt + done + for txt in tmp*txt; do + sort "${txt}" | uniq > "${txt}.union" + done + cat tmp.alt.txt.union tmp.ft.txt.union tmp.info.txt.union tmp.format.txt.union > hard.txt + cat easy.txt hard.txt > definitions.union.txt + >>> + + output { + File union_definitions = "definitions.union.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 16, + disk_gb: disk_sz, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:latest" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: 
select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}

task GetVCFSampleName {
    meta {
        # Extracts the sample name (column 10 of the #CHROM header line)
        # from a plain or bgzipped VCF.
        description: "Currently mostly used for extracting sample name in fingerprinting genotyped VCF"
    }
    input {
        File fingerprint_vcf
        # NOTE(review): declared but not consumed below — the runtime block is
        # hard-coded; kept for interface compatibility with callers.
        RuntimeAttr? runtime_attr_override
    }

    parameter_meta {
        fingerprint_vcf: "Assumed to be genotyped, and hold only one sample (other samples will be ignored)."
    }

    command <<<
        set -eux

        # Use zgrep when the input is gzip-compressed.
        GREPCMD="grep"
        if [[ ~{fingerprint_vcf} =~ \.gz$ ]]; then
            GREPCMD="zgrep"
        fi
        # Column 10 of the #CHROM line is the first (assumed only) sample.
        "${GREPCMD}" \
            "^#CHROM" \
            ~{fingerprint_vcf} \
            | awk '{print $10}' \
            > sample_name.txt
    >>>

    output {
        String sample_name = read_string("sample_name.txt")
    }

    ###################
    runtime {
        cpu: 2
        memory: "4 GiB"
        disks: "local-disk 50 HDD"
        bootDiskSizeGb: 10
        # FIX: use the standard WDL/Cromwell runtime attribute names
        # ('preemptible', 'maxRetries'), consistent with every other task in
        # this file; the previous 'preemptible_tries:'/'max_retries:' are not
        # recognized runtime keys and were silently ignored.
        preemptible: 3
        maxRetries: 2
        docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest"
    }
}

task SubsetVCF {
    input {
        File vcf_gz
        File vcf_tbi
        String locus
        String prefix = "subset"

        RuntimeAttr?
runtime_attr_override
    }

    # Twice the combined VCF+index size, plus slack for the subset output.
    Int disk_size = 2*ceil(size([vcf_gz, vcf_tbi], "GB")) + 1

    command <<<
        set -euxo pipefail

        # Region extraction via the tabix index, recompressed with bgzip so the
        # result can itself be tabix-indexed.
        bcftools view ~{vcf_gz} --regions ~{locus} | bgzip > ~{prefix}.vcf.gz
        tabix -p vcf ~{prefix}.vcf.gz
    >>>

    output {
        File subset_vcf = "~{prefix}.vcf.gz"
        File subset_tbi = "~{prefix}.vcf.gz.tbi"
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores: 1,
        mem_gb: 4,
        disk_gb: disk_size,
        boot_disk_gb: 10,
        preemptible_tries: 2,
        max_retries: 1,
        docker: "us.gcr.io/broad-dsp-lrma/lr-longshot:0.1.2"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}

task ZipAndIndexVCF {

    meta {
        description: "gZip plain text VCF and index it."
    }

    input {
        File vcf
        RuntimeAttr?
runtime_attr_override
    }

    String prefix = basename(vcf, ".vcf")
    # Disk: 3x the input (copy + bgzipped output + index), floored at 100 GB.
    Int proposed_disk = 3*ceil(size(vcf, "GB")) + 1
    Int disk_size = if (proposed_disk > 100) then proposed_disk else 100

    command <<<
        # Localize under the canonical name, block-compress, index, then print
        # the working-tree layout for debugging.
        cp ~{vcf} ~{prefix}.vcf && \
            bgzip -c ~{prefix}.vcf > ~{prefix}.vcf.gz && \
            tabix -p vcf ~{prefix}.vcf.gz && \
            find ./ -print | sed -e 's;[^/]*/;|____;g;s;____|; |;g'
    >>>

    output {
        File vcfgz = "~{prefix}.vcf.gz"
        File tbi = "~{prefix}.vcf.gz.tbi"
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores: 1,
        mem_gb: 3,
        disk_gb: disk_size,
        boot_disk_gb: 10,
        preemptible_tries: 2,
        max_retries: 2,
        docker: "us.gcr.io/broad-dsp-lrma/lr-basic:latest"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}

task IndexVCF {

    meta {
        description: "Indexing vcf.gz. Note: do NOT use remote index as that's buggy."
    }

    input {
        File vcf
        RuntimeAttr?
runtime_attr_override
    }

    String prefix = basename(vcf, ".vcf.gz")
    # Disk: 3x the input (copy + index), floored at 100 GB.
    Int proposed_disk = 3*ceil(size(vcf, "GB")) + 1
    Int disk_size = if (proposed_disk > 100) then proposed_disk else 100

    command <<<
        # Localize under the canonical name, tabix-index, then print the
        # working-tree layout for debugging.
        cp ~{vcf} ~{prefix}.vcf.gz && \
            tabix -p vcf ~{prefix}.vcf.gz && \
            find ./ -print | sed -e 's;[^/]*/;|____;g;s;____|; |;g'
    >>>

    output {
        File tbi = "~{prefix}.vcf.gz.tbi"
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores: 1,
        mem_gb: 3,
        disk_gb: disk_size,
        boot_disk_gb: 10,
        preemptible_tries: 2,
        max_retries: 2,
        docker: "us.gcr.io/broad-dsp-lrma/lr-basic:latest"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}

task FixSnifflesVCF {
    input {
        File vcf
        String sample_name
        File? ref_fasta_fai
        RuntimeAttr?
runtime_attr_override + } + + parameter_meta { + sample_name: "Sniffles infers sample name from the BAM file name, so we fix it here" + ref_fasta_fai: "provide only when the contig section of the input vcf is suspected to be corrupted" + } + + Boolean fix_contigs = defined(ref_fasta_fai) + + Boolean vcf_is_bgzipped = sub(vcf, ".gz", "") != sub(vcf, ".vcf.gz", "") + String local_raw = if vcf_is_bgzipped then "to.be.fixed.vcf.gz" else "to.be.fixed.vcf" + String local_sp_fixed = if vcf_is_bgzipped then "sample.fixed.vcf.gz" else "sample.fixed.vcf" + + String initial_grep_cmd = if vcf_is_bgzipped then "zgrep" else "grep" + + String prefix = if vcf_is_bgzipped then basename(vcf, ".vcf.gz") else basename(vcf, ".vcf") + Int proposed_disk = 3*ceil(size(vcf, "GB")) + 1 + Int disk_size = if (proposed_disk > 100) then proposed_disk else 100 + + command <<< + set -euxo pipefail + + # 1. fix sample information (Sniffles derives VCF SM information from the path to the BAM ......) + cp ~{vcf} ~{local_raw} + echo ~{sample_name} > sample_names.txt + bcftools reheader --samples sample_names.txt -o ~{local_sp_fixed} ~{local_raw} + rm ~{vcf} && rm ~{local_raw} + + #################################################################### + # 2. prep for fixing undefined VCF INFO/FT/FORMAT, also guard against when the VCF is empty + ~{initial_grep_cmd} "^##" ~{local_sp_fixed} > header.txt + ~{initial_grep_cmd} -v "^#" ~{local_sp_fixed} > body.txt || true + if [[ ! -f body.txt ]] || [[ ! -s body.txt ]]; then + echo "input VCF seem to contain only header, but I'll proceed anyway and give you only header" + bcftools \ + sort \ + --temp-dir tm_sort \ + --output-type z \ + -o ~{prefix}.vcf.gz \ + ~{local_sp_fixed} + bcftools index --tbi --force ~{prefix}.vcf.gz + exit 0; + fi + + #################################################################### + # 2.1. 
more prep for fixing undefined VCF INFO/FT/FORMATs + # get FORMATs in header + if grep -q -F '##FORMAT=<' header.txt; then + grep -F '##FORMAT=<' header.txt | awk -F ',' '{print $1}' | sed 's/##FORMAT= formats.in_header.txt + else + touch formats.in_header.txt + fi + # get FILTERs in header + if grep -q -F '##FILTER=<' header.txt; then + grep -F '##FILTER=<' header.txt | awk -F ',' '{print $1}' | sed 's/##FILTER= filters.in_header.txt + else + touch filters.in_header.txt + fi + # get non-flag INFO in header + if grep -q -F '##INFO=<' header.txt; then + grep -F '##INFO=<' header.txt | grep -vF 'Type=Flag' | awk -F ',' '{print $1}' | sed 's/##INFO= non_flag_info.in_header.txt + else + touch non_flag_info.in_header.txt + fi + # get flag INFO in header + if grep -q -F '##INFO=<' header.txt; then + grep -F '##INFO=<' header.txt | grep -F 'Type=Flag' | awk -F ',' '{print $1}' | sed 's/##INFO= flag_info.in_header.txt + else + touch flag_info.in_header.txt + fi + + # get FORMATs in practice + awk '{print $9}' body.txt | sort | uniq | sed 's/:/\n/g' | sort | uniq > formats.in_vcf.txt + # get FILTERs in practice, guard against no 'PASS' + awk '{print $7}' body.txt | sort | uniq | grep -v "^PASS$" > filters.in_vcf.txt || touch filters.in_vcf.txt + + awk '{print $8}' body.txt | sed 's/;/\n/g' > tmp.info.entries.txt + if grep -q -F '=' tmp.info.entries.txt; then + # get non-flag INFO in practicez + grep -F '=' tmp.info.entries.txt | awk -F '=' '{print $1}' | sort | uniq > non_flag_info.in_vcf.txt + fi + if grep -q -vF '=' tmp.info.entries.txt; then + # get flag INFO in practice + awk '{print $8}' body.txt | sed 's/;/\n/g' | grep -vF '=' | sort | uniq > flag_info.in_vcf.txt + fi + touch non_flag_info.in_vcf.txt + touch flag_info.in_vcf.txt + + echo "I survived. More to go..." + + #################################################################### + # 2.2. 
more prep for fixing undefined VCF INFO/FT/FORMATs + comm -13 formats.in_header.txt formats.in_vcf.txt > missing.formats.txt + while IFS= read -r line + do + echo "##FORMAT=" >> missing.formats.header + done < missing.formats.txt + + comm -13 filters.in_header.txt filters.in_vcf.txt > missing.filters.txt + while IFS= read -r line + do + echo "##FILTER=" >> missing.filters.header + done < missing.filters.txt + + comm -13 non_flag_info.in_header.txt non_flag_info.in_vcf.txt > missing.non_flag_info.txt + while IFS= read -r line + do + echo "##INFO=" >> missing.non_flag_info.header + done < missing.non_flag_info.txt + + comm -13 flag_info.in_header.txt flag_info.in_vcf.txt > missing.flag_info.txt + while IFS= read -r line + do + echo "##INFO=" >> missing.flag_info.header + done < missing.flag_info.txt + + #################################################################### + # 2. actually fix undefined VCF INFO/FT/FORMATs, if necessary + if find . -maxdepth 1 -type f -name "missing.*.header" 2>/dev/null | grep -q .; then + grep "^##" ~{local_sp_fixed} | grep -v "^##[A-Z]" | grep -vF 'contig=' > first_lines.txt + grep -F "##contig= contigs.txt + grep "^#CHROM" ~{local_sp_fixed} > sample.line.txt + grep "^##" ~{local_sp_fixed} | grep "^##[A-Z]" | sort > existing_definitions.txt + cat existing_definitions.txt missing.*.header | sort > everything.defined.txt + cat first_lines.txt contigs.txt everything.defined.txt sample.line.txt > fixed.header.txt + # print to stdout for checking + grep -vF "##contig= fixed.vcf + rm ~{local_sp_fixed} + else + mv ~{local_sp_fixed} fixed.vcf + fi + + #################################################################### + # 3. fix contigs undefined (in later stages) + if ~{fix_contigs}; then + bcftools reheader \ + --fai ~{ref_fasta_fai} \ + -o fixed.and_contigs.vcf \ + fixed.vcf + mv fixed.and_contigs.vcf fixed.vcf + fi + + #################################################################### + # 4. 
fix occationally unsorted VCF + bcftools \ + sort \ + --temp-dir tm_sort \ + --output-type z \ + -o ~{prefix}.vcf.gz \ + fixed.vcf + bcftools index --tbi --force ~{prefix}.vcf.gz + >>> + + output { + File sortedVCF = "~{prefix}.vcf.gz" + File tbi = "~{prefix}.vcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:latest" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/Utility/cnv_common_tasks.wdl b/wdl/lib/Utility/cnv_common_tasks.wdl new file mode 100644 index 0000000..53766cf --- /dev/null +++ b/wdl/lib/Utility/cnv_common_tasks.wdl @@ -0,0 +1,610 @@ +version 1.0 + +# Reusing workflow from GATK Git Repository +# https://github.com/broadinstitute/gatk/blob/master/scripts/cnv_wdl/cnv_common_tasks.wdl + +task PreprocessIntervals { + input { + File? intervals + File? blacklist_intervals + File ref_fasta + File ref_fasta_fai + File ref_fasta_dict + Int? padding + Int? bin_length + File? gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? 
preemptible_attempts + } + + Int machine_mem_mb = select_first([mem_gb, 2]) * 1000 + Int command_mem_mb = machine_mem_mb - 500 + + # Determine output filename + String filename = select_first([intervals, "wgs"]) + String base_filename = basename(filename, ".interval_list") + + command <<< + set -eu + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + + gatk --java-options "-Xmx~{command_mem_mb}m" PreprocessIntervals \ + ~{"-L " + intervals} \ + ~{"-XL " + blacklist_intervals} \ + --reference ~{ref_fasta} \ + --padding ~{default="250" padding} \ + --bin-length ~{default="1000" bin_length} \ + --interval-merging-rule OVERLAPPING_ONLY \ + --output ~{base_filename}.preprocessed.interval_list + >>> + + runtime { + docker: gatk_docker + memory: machine_mem_mb + " MB" + disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD" + cpu: select_first([cpu, 1]) + preemptible: select_first([preemptible_attempts, 5]) + } + + output { + File preprocessed_intervals = "~{base_filename}.preprocessed.interval_list" + } +} + +task AnnotateIntervals { + input { + File intervals + File ref_fasta + File ref_fasta_fai + File ref_fasta_dict + File? mappability_track_bed + File? mappability_track_bed_idx + File? segmental_duplication_track_bed + File? segmental_duplication_track_bed_idx + Int? feature_query_lookahead + File? gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? 
preemptible_attempts + } + + Int machine_mem_mb = select_first([mem_gb, 2]) * 1000 + Int command_mem_mb = machine_mem_mb - 500 + + # Determine output filename + String base_filename = basename(intervals, ".interval_list") + + command <<< + set -eu + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + + gatk --java-options "-Xmx~{command_mem_mb}m" AnnotateIntervals \ + -L ~{intervals} \ + --reference ~{ref_fasta} \ + ~{"--mappability-track " + mappability_track_bed} \ + ~{"--segmental-duplication-track " + segmental_duplication_track_bed} \ + --feature-query-lookahead ~{default=1000000 feature_query_lookahead} \ + --interval-merging-rule OVERLAPPING_ONLY \ + --output ~{base_filename}.annotated.tsv + >>> + + runtime { + docker: gatk_docker + memory: machine_mem_mb + " MB" + disks: "local-disk " + select_first([disk_space_gb, ceil(size(ref_fasta, "GB")) + 50]) + if use_ssd then " SSD" else " HDD" + cpu: select_first([cpu, 1]) + preemptible: select_first([preemptible_attempts, 5]) + } + + output { + File annotated_intervals = "~{base_filename}.annotated.tsv" + } +} + +task FilterIntervals { + input { + File intervals + File? blacklist_intervals + File? annotated_intervals + Array[File]? read_count_files + Float? minimum_gc_content + Float? maximum_gc_content + Float? minimum_mappability + Float? maximum_mappability + Float? minimum_segmental_duplication_content + Float? maximum_segmental_duplication_content + Int? low_count_filter_count_threshold + Float? low_count_filter_percentage_of_samples + Float? extreme_count_filter_minimum_percentile + Float? extreme_count_filter_maximum_percentile + Float? extreme_count_filter_percentage_of_samples + File? gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? 
preemptible_attempts + } + + Int machine_mem_mb = select_first([mem_gb, 7]) * 1000 + Int command_mem_mb = machine_mem_mb - 500 + + # Determine output filename + String base_filename = basename(intervals, ".interval_list") + + command <<< + set -eu + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + + gatk --java-options "-Xmx~{command_mem_mb}m" FilterIntervals \ + -L ~{intervals} \ + ~{"-XL " + blacklist_intervals} \ + ~{"--annotated-intervals " + annotated_intervals} \ + ~{if defined(read_count_files) then "--input " else ""} ~{sep=" --input " read_count_files} \ + --minimum-gc-content ~{default="0.1" minimum_gc_content} \ + --maximum-gc-content ~{default="0.9" maximum_gc_content} \ + --minimum-mappability ~{default="0.9" minimum_mappability} \ + --maximum-mappability ~{default="1.0" maximum_mappability} \ + --minimum-segmental-duplication-content ~{default="0.0" minimum_segmental_duplication_content} \ + --maximum-segmental-duplication-content ~{default="0.5" maximum_segmental_duplication_content} \ + --low-count-filter-count-threshold ~{default="5" low_count_filter_count_threshold} \ + --low-count-filter-percentage-of-samples ~{default="90.0" low_count_filter_percentage_of_samples} \ + --extreme-count-filter-minimum-percentile ~{default="1.0" extreme_count_filter_minimum_percentile} \ + --extreme-count-filter-maximum-percentile ~{default="99.0" extreme_count_filter_maximum_percentile} \ + --extreme-count-filter-percentage-of-samples ~{default="90.0" extreme_count_filter_percentage_of_samples} \ + --interval-merging-rule OVERLAPPING_ONLY \ + --output ~{base_filename}.filtered.interval_list + >>> + + runtime { + docker: gatk_docker + memory: machine_mem_mb + " MB" + disks: "local-disk " + select_first([disk_space_gb, 50]) + if use_ssd then " SSD" else " HDD" + cpu: select_first([cpu, 1]) + preemptible: select_first([preemptible_attempts, 5]) + } + + output { + File filtered_intervals = "~{base_filename}.filtered.interval_list" + } +} + +task 
CollectCounts { + input { + File intervals + File bam + File bam_idx + File ref_fasta + File ref_fasta_fai + File ref_fasta_dict + Boolean? enable_indexing + String? format + File? gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? preemptible_attempts + } + + Int machine_mem_mb = select_first([mem_gb, 7]) * 1000 + Int command_mem_mb = machine_mem_mb - 1000 + + Boolean enable_indexing_ = select_first([enable_indexing, false]) + + # Sample name is derived from the bam filename + String base_filename = basename(bam, ".bam") + String format_ = select_first([format, "HDF5"]) + String hdf5_or_tsv_or_null_format = + if format_ == "HDF5" then "HDF5" else + (if format_ == "TSV" then "TSV" else + (if format_ == "TSV_GZ" then "TSV" else "null")) # until we can write TSV_GZ in CollectReadCounts, we write TSV and use bgzip + String counts_filename_extension = + if format_ == "HDF5" then "counts.hdf5" else + (if format_ == "TSV" then "counts.tsv" else + (if format_ == "TSV_GZ" then "counts.tsv.gz" else "null")) + String counts_index_filename_extension = + if format_ == "HDF5" then "null" else + (if format_ == "TSV" then "counts.tsv.idx" else + (if format_ == "TSV_GZ" then "counts.tsv.gz.tbi" else "null")) + Boolean do_block_compression = + if format_ == "HDF5" then false else + (if format_ == "TSV" then false else + (if format_ == "TSV_GZ" then true else false)) + String counts_filename = "~{base_filename}.~{counts_filename_extension}" + String counts_filename_for_collect_read_counts = basename(counts_filename, ".gz") + + command <<< + set -eu + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + + case ~{format_} in + HDF5 | TSV | TSV_GZ) + ;; + *) + echo "ERROR: Unknown format specified. Format must be one of HDF5, TSV, or TSV_GZ." 
+ exit 1 + ;; + esac + + if [ ~{format_} = "HDF5" ] && [ ~{enable_indexing_} = "true" ]; then + echo "ERROR: Incompatible WDL parameters. Cannot have format = HDF5 and enable_indexing = true." + exit 1 + fi + + if [ ~{hdf5_or_tsv_or_null_format} = "null" ]; then + echo "ERROR: Should never reach here." + exit 1 + fi + + gatk --java-options "-Xmx~{command_mem_mb}m" CollectReadCounts \ + -L ~{intervals} \ + --input ~{bam} \ + --reference ~{ref_fasta} \ + --format ~{default="HDF5" hdf5_or_tsv_or_null_format} \ + --interval-merging-rule OVERLAPPING_ONLY \ + --output ~{counts_filename_for_collect_read_counts} + + if [ ~{do_block_compression} = "true" ]; then + bgzip ~{counts_filename_for_collect_read_counts} + fi + + if [ ~{enable_indexing_} = "true" ]; then + gatk --java-options "-Xmx~{command_mem_mb}m" IndexFeatureFile \ + -I ~{counts_filename} + fi + >>> + + runtime { + docker: gatk_docker + memory: machine_mem_mb + " MB" + disks: "local-disk " + select_first([disk_space_gb, ceil(size(bam, "GB")) + 50]) + if use_ssd then " SSD" else " HDD" + cpu: select_first([cpu, 1]) + preemptible: select_first([preemptible_attempts, 5]) + } + + output { + String entity_id = base_filename + File counts = counts_filename + } +} + +task CollectAllelicCounts { + input { + File common_sites + File bam + File bam_idx + File ref_fasta + File ref_fasta_fai + File ref_fasta_dict + Int? minimum_base_quality + File? gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? 
preemptible_attempts + } + + Int machine_mem_mb = select_first([mem_gb, 13]) * 1000 + Int command_mem_mb = machine_mem_mb - 1000 + + # Sample name is derived from the bam filename + String base_filename = basename(bam, ".bam") + + String allelic_counts_filename = "~{base_filename}.allelicCounts.tsv" + + command <<< + set -eu + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + + gatk --java-options "-Xmx~{command_mem_mb}m" CollectAllelicCounts \ + -L ~{common_sites} \ + --input ~{bam} \ + --reference ~{ref_fasta} \ + --minimum-base-quality ~{default="20" minimum_base_quality} \ + --output ~{allelic_counts_filename} + >>> + + runtime { + docker: gatk_docker + memory: machine_mem_mb + " MB" + disks: "local-disk " + select_first([disk_space_gb, ceil(size(bam, "GB")) + 50]) + if use_ssd then " SSD" else " HDD" + cpu: select_first([cpu, 1]) + preemptible: select_first([preemptible_attempts, 5]) + } + + output { + String entity_id = base_filename + File allelic_counts = allelic_counts_filename + } +} + +# Users should consult the IntervalListTools documentation and/or manually inspect the results of this task +# to ensure that the number of intervals in each shard is as desired, as the logic IntervalListTools uses +# for dividing intervals can yield shards that are unexpectedly larger than num_intervals_per_scatter. +# Depending on their use case, users may want to modify this task to instead use the SCATTER_COUNT option of +# IntervalListTools, which allows the number of shards to be directly specified. +task ScatterIntervals { + input { + File interval_list + Int num_intervals_per_scatter + String? output_dir + File? gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? 
preemptible_attempts + } + + Int machine_mem_mb = select_first([mem_gb, 2]) * 1000 + Int command_mem_mb = machine_mem_mb - 500 + + # If optional output_dir not specified, use "out"; + String output_dir_ = select_first([output_dir, "out"]) + + String base_filename = basename(interval_list, ".interval_list") + + command <<< + set -eu + # IntervalListTools will fail if the output directory does not exist, so we create it + mkdir ~{output_dir_} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + + # IntervalListTools behaves differently when scattering to a single or multiple shards, so we do some handling in bash + + # IntervalListTools tries to equally divide intervals across shards to give at least INTERVAL_COUNT in each and + # puts remainder intervals in the last shard, so integer division gives the number of shards + # (unless NUM_INTERVALS < num_intervals_per_scatter and NUM_SCATTERS = 0, in which case we still want a single shard) + NUM_INTERVALS=$(grep -v '@' ~{interval_list} | wc -l) + NUM_SCATTERS=$(echo $((NUM_INTERVALS / ~{num_intervals_per_scatter}))) + + if [ $NUM_SCATTERS -le 1 ]; then + # if only a single shard is required, then we can just rename the original interval list + >&2 echo "Not running IntervalListTools because only a single shard is required. Copying original interval list..." + cp ~{interval_list} ~{output_dir_}/~{base_filename}.scattered.0001.interval_list + else + gatk --java-options "-Xmx~{command_mem_mb}m" IntervalListTools \ + --INPUT ~{interval_list} \ + --SUBDIVISION_MODE INTERVAL_COUNT \ + --SCATTER_CONTENT ~{num_intervals_per_scatter} \ + --OUTPUT ~{output_dir_} + + # output files are named output_dir_/temp_0001_of_N/scattered.interval_list, etc. (N = number of scatters); + # we rename them as output_dir_/base_filename.scattered.0001.interval_list, etc. 
+ ls -v ~{output_dir_}/*/scattered.interval_list | \ + cat -n | \ + while read n filename; do mv $filename ~{output_dir_}/~{base_filename}.scattered.$(printf "%04d" $n).interval_list; done + rm -rf ~{output_dir_}/temp_*_of_* + fi + >>> + + runtime { + docker: gatk_docker + memory: machine_mem_mb + " MB" + disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD" + cpu: select_first([cpu, 1]) + preemptible: select_first([preemptible_attempts, 5]) + } + + output { + Array[File] scattered_interval_lists = glob("~{output_dir_}/~{base_filename}.scattered.*.interval_list") + } +} + +task PostprocessGermlineCNVCalls { + input { + String entity_id + Array[File] gcnv_calls_tars + Array[File] gcnv_model_tars + Array[File] calling_configs + Array[File] denoising_configs + Array[File] gcnvkernel_version + Array[File] sharded_interval_lists + File contig_ploidy_calls_tar + Array[String]? allosomal_contigs + Int ref_copy_number_autosomal_contigs + Int sample_index + File? gatk4_jar_override + + # Runtime parameters + String gatk_docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? cpu + Int? 
preemptible_attempts + } + + Int machine_mem_mb = select_first([mem_gb, 7]) * 1000 + Int command_mem_mb = machine_mem_mb - 1000 + + String genotyped_intervals_vcf_filename = "genotyped-intervals-~{entity_id}.vcf.gz" + String genotyped_segments_vcf_filename = "genotyped-segments-~{entity_id}.vcf.gz" + String denoised_copy_ratios_filename = "denoised_copy_ratios-~{entity_id}.tsv" + + Array[String] allosomal_contigs_args = if defined(allosomal_contigs) then prefix("--allosomal-contig ", select_first([allosomal_contigs])) else [] + + command <<< + set -eu + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override} + + sharded_interval_lists_array=(~{sep=" " sharded_interval_lists}) + + # untar calls to CALLS_0, CALLS_1, etc directories and build the command line + # also copy over shard config and interval files + gcnv_calls_tar_array=(~{sep=" " gcnv_calls_tars}) + calling_configs_array=(~{sep=" " calling_configs}) + denoising_configs_array=(~{sep=" " denoising_configs}) + gcnvkernel_version_array=(~{sep=" " gcnvkernel_version}) + sharded_interval_lists_array=(~{sep=" " sharded_interval_lists}) + calls_args="" + for index in ${!gcnv_calls_tar_array[@]}; do + gcnv_calls_tar=${gcnv_calls_tar_array[$index]} + mkdir -p CALLS_$index/SAMPLE_~{sample_index} + tar xzf $gcnv_calls_tar -C CALLS_$index/SAMPLE_~{sample_index} + cp ${calling_configs_array[$index]} CALLS_$index/ + cp ${denoising_configs_array[$index]} CALLS_$index/ + cp ${gcnvkernel_version_array[$index]} CALLS_$index/ + cp ${sharded_interval_lists_array[$index]} CALLS_$index/ + calls_args="$calls_args --calls-shard-path CALLS_$index" + done + + # untar models to MODEL_0, MODEL_1, etc directories and build the command line + gcnv_model_tar_array=(~{sep=" " gcnv_model_tars}) + model_args="" + for index in ${!gcnv_model_tar_array[@]}; do + gcnv_model_tar=${gcnv_model_tar_array[$index]} + mkdir MODEL_$index + tar xzf $gcnv_model_tar -C MODEL_$index + model_args="$model_args --model-shard-path MODEL_$index" + 
        done

        # Unpack the per-contig ploidy calls needed by PostprocessGermlineCNVCalls.
        mkdir contig-ploidy-calls
        tar xzf ~{contig_ploidy_calls_tar} -C contig-ploidy-calls

        # Consolidate the per-shard calls/models into per-sample interval and
        # segment VCFs plus denoised copy ratios.
        gatk --java-options "-Xmx~{command_mem_mb}m" PostprocessGermlineCNVCalls \
            $calls_args \
            $model_args \
            ~{sep=" " allosomal_contigs_args} \
            --autosomal-ref-copy-number ~{ref_copy_number_autosomal_contigs} \
            --contig-ploidy-calls contig-ploidy-calls \
            --sample-index ~{sample_index} \
            --output-genotyped-intervals ~{genotyped_intervals_vcf_filename} \
            --output-genotyped-segments ~{genotyped_segments_vcf_filename} \
            --output-denoised-copy-ratios ~{denoised_copy_ratios_filename}

        # Remove the unpacked shard directories so they are not picked up as outputs.
        rm -rf CALLS_*
        rm -rf MODEL_*
        rm -rf contig-ploidy-calls
    >>>

    runtime {
        docker: gatk_docker
        memory: machine_mem_mb + " MB"
        disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD"
        cpu: select_first([cpu, 1])
        preemptible: select_first([preemptible_attempts, 5])
    }

    output {
        File genotyped_intervals_vcf = genotyped_intervals_vcf_filename
        File genotyped_segments_vcf = genotyped_segments_vcf_filename
        File denoised_copy_ratios = denoised_copy_ratios_filename
    }
}

# Writes a per-sample QC status file: "PASS" when the genotyped segments VCF
# holds fewer than maximum_number_events records, otherwise
# "EXCESSIVE_NUMBER_OF_EVENTS".
task CollectSampleQualityMetrics {
    input {
        File genotyped_segments_vcf    # gzipped segments VCF from PostprocessGermlineCNVCalls
        String entity_id               # sample identifier; names the QC status file
        Int maximum_number_events      # segment-count threshold for passing QC

        # Runtime parameters
        String gatk_docker
        Int? mem_gb
        Int? disk_space_gb
        Boolean use_ssd = false
        Int? cpu
        Int?
preemptible_attempts
    }

    Int machine_mem_mb = select_first([mem_gb, 1]) * 1000

    command <<<
        set -eu
        # Count the non-header records of the genotyped segments VCF; too many
        # events indicates a problematic sample.
        NUM_SEGMENTS=$(gunzip -c ~{genotyped_segments_vcf} | grep -v '#' | wc -l)
        if [ $NUM_SEGMENTS -lt ~{maximum_number_events} ]; then
            echo "PASS" >> ~{entity_id}.qcStatus.txt
        else
            echo "EXCESSIVE_NUMBER_OF_EVENTS" >> ~{entity_id}.qcStatus.txt
        fi
    >>>

    runtime {
        docker: gatk_docker
        memory: machine_mem_mb + " MB"
        disks: "local-disk " + select_first([disk_space_gb, 20]) + if use_ssd then " SSD" else " HDD"
        cpu: select_first([cpu, 1])
        preemptible: select_first([preemptible_attempts, 5])
    }

    output {
        File qc_status_file = "~{entity_id}.qcStatus.txt"
        String qc_status_string = read_string("~{entity_id}.qcStatus.txt")
    }
}

# Scans every gCNV model shard's ARD mean file (mu_ard_u_log__.tsv); a shard
# with no positive ARD value is flagged "ALL_PRINCIPAL_COMPONENTS_USED",
# otherwise the model set passes.
task CollectModelQualityMetrics {
    input {
        Array[File] gcnv_model_tars    # one model tarball per shard

        # Runtime parameters
        String gatk_docker
        Int? mem_gb
        Int? disk_space_gb
        Boolean use_ssd = false
        Int? cpu
        Int? preemptible_attempts
    }

    Int machine_mem_mb = select_first([mem_gb, 1]) * 1000

    command <<<
        # A stray, argument-less `sed -e` used to precede this line and always
        # errored; removed. Fail fast like the sibling QC task instead.
        set -eu
        qc_status="PASS"

        gcnv_model_tar_array=(~{sep=" " gcnv_model_tars})
        for index in ${!gcnv_model_tar_array[@]}; do
            gcnv_model_tar=${gcnv_model_tar_array[$index]}
            mkdir MODEL_$index
            tar xzf $gcnv_model_tar -C MODEL_$index
            ard_file=MODEL_$index/mu_ard_u_log__.tsv

            # check whether all values for ARD components are negative
            # (skip "@"-prefixed metadata lines; count strictly positive values)
            NUM_POSITIVE_VALUES=$(awk '{ if (index($0, "@") == 0) {if ($1 > 0.0) {print $1} }}' "$ard_file" | wc -l)
            if [ $NUM_POSITIVE_VALUES -eq 0 ]; then
                qc_status="ALL_PRINCIPAL_COMPONENTS_USED"
                break
            fi
        done
        echo $qc_status >> qcStatus.txt
    >>>

    runtime {
        docker: gatk_docker
        memory: machine_mem_mb + " MB"
        disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD"
        cpu: select_first([cpu, 1])
        preemptible: select_first([preemptible_attempts, 5])
    }

    output {
        File qc_status_file = "qcStatus.txt"
        String
qc_status_string = read_string("qcStatus.txt") + } +} diff --git a/wdl/lib/VariantCalling/CCSPepper.wdl b/wdl/lib/VariantCalling/CCSPepper.wdl new file mode 100644 index 0000000..492f12a --- /dev/null +++ b/wdl/lib/VariantCalling/CCSPepper.wdl @@ -0,0 +1,332 @@ +version 1.0 + +####################################################### +# This pipeline calls small variants using DeepVariant. +####################################################### + +import "../../structs/Structs.wdl" + + +workflow CCSPepper { + + meta { + description: "Workflow for getting haplotagged BAM, VCF and gVCF from DV-pepper. Note VCF is un-phased." + } + + input { + File bam + File bai + + File ref_fasta + File ref_fasta_fai + + Int pepper_threads + Int pepper_memory + + Int dv_threads + Int dv_memory + + String zones = "us-central1-b us-central1-c" + } + + parameter_meta { + # when running large scale workflows, we sometimes see errors like the following + # A resource limit has delayed the operation: generic::resource_exhausted: allocating: selecting resources: selecting region and zone: + # no available zones: 2763 LOCAL_SSD_TOTAL_GB (738/30000 available) usage too high + zones: "select which zone (GCP) to run this task" + } + + call Pepper as get_hap_tagged_bam { + input: + bam = bam, + bai = bai, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + threads = pepper_threads, + memory = pepper_memory, + zones = zones + } + + call DV as deep_variant { + input: + bam = get_hap_tagged_bam.hap_tagged_bam, + bai = get_hap_tagged_bam.hap_tagged_bai, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + threads = dv_threads, + memory = dv_memory, + zones = zones + } + + output { + File VCF = deep_variant.VCF + File VCF_tbi = deep_variant.VCF_tbi + + File gVCF = deep_variant.gVCF + File gVCF_tbi = deep_variant.gVCF_tbi + + File hap_tagged_bam = get_hap_tagged_bam.hap_tagged_bam + File hap_tagged_bai = get_hap_tagged_bam.hap_tagged_bai + } +} + +task Pepper { + input { + File bam + 
File bai + + File ref_fasta + File ref_fasta_fai + + Int threads + Int memory + String zones + + RuntimeAttr? runtime_attr_override + } + + Int bam_sz = ceil(size(bam, "GB")) + Int disk_size = if bam_sz > 200 then 2*bam_sz else bam_sz + 200 + + String output_root = "/cromwell_root/pepper_output" + + String prefix = basename(bam, ".bam") + ".pepper" + + command <<< + # avoid the infamous pipefail 141 https://stackoverflow.com/questions/19120263 + set -eux + SM=$(samtools view -H ~{bam} | grep -m1 '^@RG' | sed 's/\t/\n/g' | grep '^SM:' | sed 's/SM://g') + + set -euxo pipefail + + touch ~{bai} + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + mkdir -p "~{output_root}" + + # no gVCF as it Pepper simply doesn't produce gVCF on CCS data + run_pepper_margin_deepvariant \ + call_variant \ + -b ~{bam} \ + -f ~{ref_fasta} \ + -t "${num_core}" \ + -s "${SM}" \ + -o "~{output_root}" \ + -p "~{prefix}" \ + --phased_output \ + --ccs + + find "~{output_root}/" -print | sed -e 's;[^/]*/;|____;g;s;____|; |;g' \ + > "~{output_root}/dir_structure.txt" + + if [[ -f "~{output_root}/intermediate_files/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" ]]; then + mv "~{output_root}/intermediate_files/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" \ + "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" + mv "~{output_root}/intermediate_files/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam.bai" \ + "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam.bai" + fi + >>> + + output { + File hap_tagged_bam = "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" + File hap_tagged_bai = "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam.bai" + + # maybe less useful + File output_dir_structure = "~{output_root}/dir_structure.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: threads, + mem_gb: memory, + disk_gb: disk_size, + boot_disk_gb: 100, + preemptible_tries: 1, + max_retries: 1, + 
docker: "kishwars/pepper_deepvariant:r0.4.1"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
        zones: zones
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}

# Runs DeepVariant (PACBIO model) on a haplotagged BAM, producing VCF + gVCF
# plus a resource-usage log from a background monitoring script.
task DV {

    input {
        File bam
        File bai

        File ref_fasta
        File ref_fasta_fai

        Int threads
        Int memory
        String zones

        RuntimeAttr? runtime_attr_override
    }

    String prefix = basename(bam, ".bam") + ".deepvariant"
    String output_root = "/cromwell_root/dv_output"

    # Disk sizing: inflate more aggressively for BAMs over 100 GB, never below 1 TB.
    Int bam_sz = ceil(size(bam, "GB"))
    Boolean is_big_bam = bam_sz > 100
    Int inflation_factor = if (is_big_bam) then 10 else 5
    Int minimal_disk = 1000
    Int disk_size = if inflation_factor * bam_sz > minimal_disk then inflation_factor * bam_sz else minimal_disk

    command <<<
        set -euxo pipefail

        num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l)

        mkdir -p "~{output_root}"

        # Background resource monitor; assumed present in the task image
        # (NOTE(review): path not visible here -- confirm image ships the script).
        export MONITOR_MOUNT_POINT="/cromwell_root/"
        bash vm_local_monitoring_script.sh &> resources.log &
        job_id=$(ps -aux | grep -F 'vm_local_monitoring_script.sh' | head -1 | awk '{print $2}')

        # On failure, dump the resource log for debugging but still fail the
        # task (previously `|| cat resources.log` masked the error and let the
        # task continue with missing outputs).
        /opt/deepvariant/bin/run_deepvariant \
            --model_type=PACBIO \
            --ref=~{ref_fasta} \
            --reads=~{bam} \
            --output_vcf="~{output_root}/~{prefix}.vcf.gz" \
            --output_gvcf="~{output_root}/~{prefix}.g.vcf.gz" \
            --num_shards="${num_core}" \
            --use_hp_information || { cat resources.log; exit 1; }
        if ps -p "${job_id}" > /dev/null; then kill "${job_id}"; fi

        find "~{output_root}/"
-print | sed -e 's;[^/]*/;|____;g;s;____|; |;g' \ + > "~{output_root}/dir_structure.txt" + >>> + + output { + + File resouce_monitor_log = "resources.log" + + File output_dir_structure = "~{output_root}/dir_structure.txt" + + File VCF = "~{output_root}/~{prefix}.vcf.gz" + File VCF_tbi = "~{output_root}/~{prefix}.vcf.gz.tbi" + + File gVCF = "~{output_root}/~{prefix}.g.vcf.gz" + File gVCF_tbi = "~{output_root}/~{prefix}.g.vcf.gz.tbi" + + File visual_report_html = "~{output_root}/~{prefix}.visual_report.html" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: threads, + mem_gb: memory, + disk_gb: disk_size, + boot_disk_gb: 100, + preemptible_tries: 3, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-deepvariant:1.3.0" + # docker: "google/deepvariant:1.2.0-gpu" # kept here to remind ourselves, occassionally, to review if it's better with GPU + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + zones: zones + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MarginPhase { + + meta { + description: "Generates phased VCF. Note this runs fast so no need to parallize." + } + + input { + File bam + File bai + + File unphased_vcf + File? unphased_vcf_tbi + + File ref_fasta + File ref_fasta_fai + + Int memory + String zones + + RuntimeAttr? 
runtime_attr_override + } + + Int bam_sz = ceil(size(bam, "GB")) + Int disk_size = if bam_sz > 200 then 2*bam_sz else bam_sz + 200 + + Int cores = 64 + + String prefix = basename(bam, ".bam") + ".pepper" + String output_root = "/cromwell_root/margin_output" + + command <<< + set -euxo pipefail + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + mkdir -p "~{output_root}" "~{output_root}/logs" + touch ~{bai} + + # note the -M option was suggested by an author of margin + # it's unclear which phasedBAM one should use: this, or the one generated from the Pepper step + margin phase \ + ~{bam} \ + ~{ref_fasta} \ + ~{unphased_vcf} \ + /opt/margin_dir/params/misc/allParams.phase_vcf.json \ + -t "${num_core}" \ + -M \ + -o "~{output_root}/~{prefix}" \ + 2>&1 | tee "~{output_root}/logs/5_margin_phase_vcf.log" + + bgzip -c "~{output_root}/~{prefix}".phased.vcf > "~{output_root}/~{prefix}".phased.vcf.gz && \ + tabix -p vcf "~{output_root}/~{prefix}".phased.vcf.gz + >>> + + + output { + File phaseset_bed = "~{output_root}/~{prefix}.phaseset.bed" + File phasedVCF = "~{output_root}/~{prefix}.phased.vcf.gz" + File phasedtbi = "~{output_root}/~{prefix}.phased.vcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: cores, + mem_gb: memory, + disk_gb: disk_size, + boot_disk_gb: 100, + preemptible_tries: 3, + max_retries: 0, + docker: "kishwars/pepper_deepvariant:r0.4.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + zones: zones + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: 
select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/VariantCalling/CallAssemblyVariants.wdl b/wdl/lib/VariantCalling/CallAssemblyVariants.wdl new file mode 100644 index 0000000..5df4555 --- /dev/null +++ b/wdl/lib/VariantCalling/CallAssemblyVariants.wdl @@ -0,0 +1,132 @@ +version 1.0 + +import "../../structs/Structs.wdl" +import "../Utility/Utils.wdl" as Utils + +workflow CallAssemblyVariants { + input { + File asm_fasta + File ref_fasta + String participant_name + String prefix + } + + parameter_meta { + asm_fasta: "haploid assembly" + ref_fasta: "reference to which assembly should be aligned" + participant_name: "participant name" + prefix: "prefix for output files" + } + + call AlignAsPAF { + input: + ref_fasta = ref_fasta, + asm_fasta = asm_fasta, + prefix = prefix + } + + call Paftools { + input: + ref_fasta = ref_fasta, + paf = AlignAsPAF.paf, + participant_name = participant_name, + prefix = prefix + } + + output { + File paf = AlignAsPAF.paf + File paftools_vcf = Paftools.variants + } +} + +task AlignAsPAF { + input { + File ref_fasta + File asm_fasta + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 4*ceil(size(ref_fasta, "GB") + size(asm_fasta, "GB")) + Int num_cpus = 4 + + command <<< + set -euxo pipefail + + minimap2 --paf-no-hit -cx asm20 --cs -r 2k -t ~{num_cpus} \ + ~{ref_fasta} ~{asm_fasta} | \ + gzip -1 > ~{prefix}.paf.gz + >>> + + output { + File paf = "~{prefix}.paf.gz" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_cpus, + mem_gb: 40, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-asm:0.1.13" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Paftools { + input { + File ref_fasta + File paf + String participant_name + String prefix + + RuntimeAttr? 
runtime_attr_override
    }

    Int disk_size = 2*ceil(size(ref_fasta, "GB") + size(paf, "GB"))
    Int num_cpus = 1

    command <<<
        # Fail fast and surface broken pipeline stages (zcat/sort/paftools),
        # matching the sibling AlignAsPAF task; previously a failed stage could
        # silently produce a truncated VCF with exit status 0.
        set -euxo pipefail

        # paftools.js call expects the PAF sorted by target name and start.
        zcat ~{paf} | \
            sort -k6,6 -k8,8n | \
            paftools.js call -f ~{ref_fasta} -s ~{participant_name} - \
            > ~{prefix}.paftools.vcf
    >>>

    output {
        File variants = "~{prefix}.paftools.vcf"
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores: num_cpus,
        mem_gb: 20,
        disk_gb: disk_size,
        boot_disk_gb: 10,
        preemptible_tries: 3,
        max_retries: 2,
        docker: "us.gcr.io/broad-dsp-lrma/lr-asm:0.1.13"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}
diff --git a/wdl/lib/VariantCalling/CallVariantsONT.wdl b/wdl/lib/VariantCalling/CallVariantsONT.wdl
new file mode 100644
index 0000000..0ba8d9e
--- /dev/null
+++ b/wdl/lib/VariantCalling/CallVariantsONT.wdl
@@ -0,0 +1,273 @@
version 1.0

import "../Utility/Utils.wdl"
import "../Utility/VariantUtils.wdl"
import "PBSV.wdl"
import "Sniffles2.wdl" as Sniffles2
import "Clair.wdl" as Clair3
import "ONTPepper.wdl"

workflow CallVariants {
    meta {
        # key fixed from misspelled "descrition" so tooling can surface it
        description: "A workflow for calling small and/or structural variants from an aligned ONT BAM file."
+ } + input { + File bam + File bai + Int minsvlen = 50 + String prefix + String sample_id + + File ref_fasta + File ref_fasta_fai + File ref_dict + + Boolean call_svs + Boolean fast_less_sensitive_sv + File? tandem_repeat_bed + + Boolean call_small_variants + Boolean call_small_vars_on_mitochondria + File? sites_vcf + File? sites_vcf_tbi + + Boolean run_dv_pepper_analysis + Int? dvp_threads + Int? dvp_memory + File? ref_scatter_interval_list_locator + File? ref_scatter_interval_list_ids + } + + parameter_meta { + fast_less_sensitive_sv: "to trade less sensitive SV calling for faster speed" + tandem_repeat_bed: "BED file containing TRF finder for better PBSV calls (e.g. http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.trf.bed.gz)" + minsvlen : "Minimum SV length in bp (default: 50)" + call_small_vars_on_mitochondria: "if false, will not attempt to call variants on mitochondria" + sites_vcf: "for use with Clair" + sites_vcf_tbi: "for use with Clair" + + run_dv_pepper_analysis: "to turn on DV-Pepper analysis or not (non-trivial increase in cost and runtime)" + ref_scatter_interval_list_locator: "A file holding paths to interval_list files; needed only when running DV-Pepper" + ref_scatter_interval_list_ids: "A file that gives short IDs to the interval_list files; needed only when running DV-Pepper" + } + + ###################################################################### + # Block for small variants handling + ###################################################################### + + call Utils.RandomZoneSpewer as arbitrary {input: num_of_zones = 3} + + # todo: merge the two scattering scheme into a better one + if (call_small_variants) { + # Scatter by chromosome + Array[String] default_filter = ['random', 'chrUn', 'decoy', 'alt', 'HLA', 'EBV'] + Array[String] use_filter = if (call_small_vars_on_mitochondria) then default_filter else flatten([['chrM'],default_filter]) + call Utils.MakeChrIntervalList as SmallVariantsScatterPrepp { + input: + ref_dict 
= ref_dict, + filter = use_filter + } + + scatter (c in SmallVariantsScatterPrepp.chrs) { + String chr = c[0] + + call Utils.SubsetBam as SmallVariantsScatter { + input: + bam = bam, + bai = bai, + locus = chr + } + + call Clair3.Clair { + input: + bam = SmallVariantsScatter.subset_bam, + bai = SmallVariantsScatter.subset_bai, + + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + + sites_vcf = sites_vcf, + sites_vcf_tbi = sites_vcf_tbi, + + preset = "ONT", + zones = arbitrary.zones + } + } + + call VariantUtils.MergeAndSortVCFs as MergeAndSortClairVCFs { + input: + vcfs = Clair.vcf, + ref_fasta_fai = ref_fasta_fai, + prefix = prefix + ".clair" + } + + call VariantUtils.MergeAndSortVCFs as MergeAndSortClair_gVCFs { + input: + vcfs = Clair.gvcf, + ref_fasta_fai = ref_fasta_fai, + prefix = prefix + ".clair.g" + } + + # size-balanced scatter + if (run_dv_pepper_analysis) { + File scatter_interval_list_ids = select_first([ref_scatter_interval_list_ids]) + File scatter_interval_list_loc = select_first([ref_scatter_interval_list_locator]) + Array[String] interval_list_ids = read_lines(scatter_interval_list_ids) + Array[String] interval_list_files = read_lines(scatter_interval_list_loc) + Array[Pair[String, String]] ided_interval_list_files = zip(interval_list_ids, interval_list_files) + + scatter (pair in ided_interval_list_files) { + call Utils.ResilientSubsetBam as size_balanced_scatter { + input: + bam = bam, + bai = bai, + interval_list_file = pair.right, + interval_id = pair.left, + prefix = basename(bam, ".bam") + } + + call ONTPepper.Pepper { + input: + bam = size_balanced_scatter.subset_bam, + bai = size_balanced_scatter.subset_bai, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + threads = select_first([dvp_threads]), + memory = select_first([dvp_memory]), + zones = arbitrary.zones + } + } + + String dvp_prefix = prefix + ".deepvariant_pepper" + + call VariantUtils.MergeAndSortVCFs as MergeDeepVariantGVCFs { + input: + vcfs = Pepper.gVCF, + 
prefix = dvp_prefix + ".g", + ref_fasta_fai = ref_fasta_fai + } + + call VariantUtils.MergeAndSortVCFs as MergeDeepVariantPhasedVCFs { + input: + vcfs = Pepper.phasedVCF, + prefix = dvp_prefix + ".phased", + ref_fasta_fai = ref_fasta_fai + } + + call VariantUtils.MergeAndSortVCFs as MergeDeepVariantVCFs { + input: + vcfs = Pepper.VCF, + prefix = dvp_prefix, + ref_fasta_fai = ref_fasta_fai + } + + call Utils.MergeBams { + input: + bams = Pepper.hap_tagged_bam, + prefix = prefix + ".MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged" + } + } + } + ###################################################################### + # Block for SV handling + ###################################################################### + if (call_svs) { + if (fast_less_sensitive_sv) { + + call Utils.MakeChrIntervalList { + input: + ref_dict = ref_dict, + filter = ['random', 'chrUn', 'decoy', 'alt', 'HLA', 'EBV'] + } + + scatter (c in MakeChrIntervalList.chrs) { + String contig = c[0] + + call Utils.SubsetBam { + input: + bam = bam, + bai = bai, + locus = contig + } + + call PBSV.RunPBSV { + input: + bam = SubsetBam.subset_bam, + bai = SubsetBam.subset_bai, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + prefix = prefix, + tandem_repeat_bed = tandem_repeat_bed, + is_ccs = false, + zones = arbitrary.zones + } + + call Utils.InferSampleName { + input: + bam = SubsetBam.subset_bam, + bai = SubsetBam.subset_bai + } + + } + + call VariantUtils.MergePerChrCalls as MergePBSVVCFs { + input: + vcfs = RunPBSV.vcf, + ref_dict = ref_dict, + prefix = prefix + ".pbsv" + } + + } + + if (!fast_less_sensitive_sv) { + call PBSV.RunPBSV as PBSVslow { + input: + bam = bam, + bai = bai, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + prefix = prefix, + tandem_repeat_bed = tandem_repeat_bed, + is_ccs = false, + zones = arbitrary.zones + } + + call VariantUtils.ZipAndIndexVCF as ZipAndIndexPBSV {input: vcf = PBSVslow.vcf } + } + + call Sniffles2.SampleSV as Sniffles2SV { + input: + bam = bam, + 
bai = bai, + minsvlen = minsvlen, + sample_id = sample_id, + prefix = prefix + } + + call VariantUtils.ZipAndIndexVCF as ZipAndIndexSnifflesVCF { + input: + vcf = Sniffles2SV.vcf + } + } + + output { + File? sniffles_vcf = ZipAndIndexSnifflesVCF.vcfgz + File? sniffles_tbi = ZipAndIndexSnifflesVCF.tbi + + File? pbsv_vcf = select_first([MergePBSVVCFs.vcf, ZipAndIndexPBSV.vcfgz]) + File? pbsv_tbi = select_first([MergePBSVVCFs.tbi, ZipAndIndexPBSV.tbi]) + + File? clair_vcf = MergeAndSortClairVCFs.vcf + File? clair_tbi = MergeAndSortClairVCFs.tbi + + File? clair_gvcf = MergeAndSortClair_gVCFs.vcf + File? clair_gtbi = MergeAndSortClair_gVCFs.tbi + + File? dvp_phased_vcf = MergeDeepVariantPhasedVCFs.vcf + File? dvp_phased_tbi = MergeDeepVariantPhasedVCFs.tbi + File? dvp_g_vcf = MergeDeepVariantGVCFs.vcf + File? dvp_g_tbi = MergeDeepVariantGVCFs.tbi + File? dvp_vcf = MergeDeepVariantVCFs.vcf + File? dvp_tbi = MergeDeepVariantVCFs.tbi + } +} diff --git a/wdl/lib/VariantCalling/CallVariantsPBCCS.wdl b/wdl/lib/VariantCalling/CallVariantsPBCCS.wdl new file mode 100644 index 0000000..a713d47 --- /dev/null +++ b/wdl/lib/VariantCalling/CallVariantsPBCCS.wdl @@ -0,0 +1,279 @@ +version 1.0 + +import "../Utility/Utils.wdl" +import "../Utility/VariantUtils.wdl" +import "PBSV.wdl" +import "Sniffles2.wdl" as Sniffles2 +import "Clair.wdl" as Clair3 +import "CCSPepper.wdl" + +workflow CallVariants { + meta { + description: "A workflow for calling small and/or structural variants from an aligned CCS BAM file." + } + input { + File bam + File bai + Int minsvlen = 50 + String prefix + String sample_id + + File ref_fasta + File ref_fasta_fai + File ref_dict + + Boolean call_svs + Boolean fast_less_sensitive_sv + File? tandem_repeat_bed + + Boolean call_small_variants + Boolean call_small_vars_on_mitochondria + File? sites_vcf + File? sites_vcf_tbi + + Boolean run_dv_pepper_analysis + Int? dvp_threads + Int? dvp_memory + File? ref_scatter_interval_list_locator + File? 
ref_scatter_interval_list_ids + } + + parameter_meta { + fast_less_sensitive_sv: "to trade less sensitive SV calling for faster speed" + tandem_repeat_bed: "BED file containing TRF finder for better PBSV calls (e.g. http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.trf.bed.gz)" + minsvlen: "Minimum SV length in bp (default: 50)" + + call_small_vars_on_mitochondria: "if false, will not attempt to call variants on mitochondria" + sites_vcf: "for use with Clair" + sites_vcf_tbi: "for use with Clair" + + run_dv_pepper_analysis: "to turn on DV-Pepper analysis or not (non-trivial increase in cost and runtime)" + ref_scatter_interval_list_locator: "A file holding paths to interval_list files; needed only when running DV-Pepper" + ref_scatter_interval_list_ids: "A file that gives short IDs to the interval_list files; needed only when running DV-Pepper" + } + + ###################################################################### + # Block for small variants handling + ###################################################################### + + call Utils.RandomZoneSpewer as arbitrary {input: num_of_zones = 3} + + # todo: merge the two scattering scheme into a better one + if (call_small_variants) { + # Scatter by chromosome + Array[String] default_filter = ['random', 'chrUn', 'decoy', 'alt', 'HLA', 'EBV'] + Array[String] use_filter = if (call_small_vars_on_mitochondria) then default_filter else flatten([['chrM'],default_filter]) + call Utils.MakeChrIntervalList as SmallVariantsScatterPrepp { + input: + ref_dict = ref_dict, + filter = use_filter + } + + scatter (c in SmallVariantsScatterPrepp.chrs) { + String contig_for_small_var = c[0] + + call Utils.SubsetBam as SmallVariantsScatter { + input: + bam = bam, + bai = bai, + locus = contig_for_small_var + } + + call Clair3.Clair { + input: + bam = SmallVariantsScatter.subset_bam, + bai = SmallVariantsScatter.subset_bai, + + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + + sites_vcf = sites_vcf, + 
sites_vcf_tbi = sites_vcf_tbi, + + preset = "CCS", + zones = arbitrary.zones + } + } + + call VariantUtils.MergeAndSortVCFs as MergeAndSortClairVCFs { + input: + vcfs = Clair.vcf, + ref_fasta_fai = ref_fasta_fai, + prefix = prefix + ".clair" + } + + call VariantUtils.MergeAndSortVCFs as MergeAndSortClair_gVCFs { + input: + vcfs = Clair.gvcf, + ref_fasta_fai = ref_fasta_fai, + prefix = prefix + ".clair.g" + } + + # size-balanced scatter + # todo: phasing isn't done for CCS data yet, waiting for Pepper Team to respond + if (run_dv_pepper_analysis) { + File scatter_interval_list_ids = select_first([ref_scatter_interval_list_ids]) + File scatter_interval_list_loc = select_first([ref_scatter_interval_list_locator]) + Array[String] interval_list_ids = read_lines(scatter_interval_list_ids) + Array[String] interval_list_files = read_lines(scatter_interval_list_loc) + Array[Pair[String, String]] ided_interval_list_files = zip(interval_list_ids, interval_list_files) + + scatter (pair in ided_interval_list_files) { + call Utils.ResilientSubsetBam as size_balanced_scatter { + input: + bam = bam, + bai = bai, + interval_list_file = pair.right, + interval_id = pair.left, + prefix = basename(bam, ".bam") + } + + call CCSPepper.CCSPepper { + input: + bam = size_balanced_scatter.subset_bam, + bai = size_balanced_scatter.subset_bai, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + + pepper_threads = select_first([dvp_threads]), + pepper_memory = select_first([dvp_memory]), + dv_threads = select_first([dvp_threads]), + dv_memory = select_first([dvp_memory]), + zones = arbitrary.zones + } + } + + String dvp_prefix = prefix + ".deepvariant_pepper" + + call VariantUtils.MergeAndSortVCFs as MergeDeepVariantGVCFs { + input: + vcfs = CCSPepper.gVCF, + prefix = dvp_prefix + ".g", + ref_fasta_fai = ref_fasta_fai + } + + # todo: phasing VCF could happen here, i.e. 
on gathered VCFs as that's going to be less intensive + call VariantUtils.MergeAndSortVCFs as MergeDeepVariantVCFs { + input: + vcfs = CCSPepper.VCF, + prefix = dvp_prefix, + ref_fasta_fai = ref_fasta_fai + } + + call Utils.MergeBams { + input: + bams = CCSPepper.hap_tagged_bam, + prefix = prefix + ".MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged" + } + + call CCSPepper.MarginPhase { + input: + bam = bam, + bai = bai, + unphased_vcf = MergeDeepVariantVCFs.vcf, + unphased_vcf_tbi = MergeDeepVariantVCFs.tbi, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + memory = select_first([dvp_memory, 64]), + zones = arbitrary.zones + } + } + } + + ###################################################################### + # Block for SV handling + ###################################################################### + if (call_svs) { + if (fast_less_sensitive_sv) { + + call Utils.MakeChrIntervalList { + input: + ref_dict = ref_dict, + filter = ['random', 'chrUn', 'decoy', 'alt', 'HLA', 'EBV'] + } + + scatter (c in MakeChrIntervalList.chrs) { + String contig_for_sv = c[0] + + call Utils.SubsetBam { + input: + bam = bam, + bai = bai, + locus = contig_for_sv + } + + call PBSV.RunPBSV { + input: + bam = SubsetBam.subset_bam, + bai = SubsetBam.subset_bai, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + prefix = prefix, + tandem_repeat_bed = tandem_repeat_bed, + is_ccs = true, + zones = arbitrary.zones + } + + } + + call VariantUtils.MergePerChrCalls as MergePBSVVCFs { + input: + vcfs = RunPBSV.vcf, + ref_dict = ref_dict, + prefix = prefix + ".pbsv" + } + + } + + if (!fast_less_sensitive_sv) { + call PBSV.RunPBSV as PBSVslow { + input: + bam = bam, + bai = bai, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + prefix = prefix, + tandem_repeat_bed = tandem_repeat_bed, + is_ccs = true, + zones = arbitrary.zones + } + + call VariantUtils.ZipAndIndexVCF as ZipAndIndexPBSV {input: vcf = PBSVslow.vcf } + } + + call Sniffles2.SampleSV as Sniffles2SV { + input: + 
bam = bam,
                bai = bai,
                minsvlen = minsvlen,
                sample_id = sample_id,
                prefix = prefix
        }

        call VariantUtils.ZipAndIndexVCF as ZipAndIndexSnifflesVCF {
            input:
                vcf = Sniffles2SV.vcf
        }
    }

    output {
        File? sniffles_vcf = ZipAndIndexSnifflesVCF.vcfgz
        File? sniffles_tbi = ZipAndIndexSnifflesVCF.tbi
        File? sniffles_snf = Sniffles2SV.snf
        # NOTE(review): select_first raises at runtime when call_svs is false
        # (neither pbsv branch ran, so both candidates are undefined) --
        # confirm callers never evaluate these outputs without call_svs set.
        File? pbsv_vcf = select_first([MergePBSVVCFs.vcf, ZipAndIndexPBSV.vcfgz])
        File? pbsv_tbi = select_first([MergePBSVVCFs.tbi, ZipAndIndexPBSV.tbi])

        File? clair_vcf = MergeAndSortClairVCFs.vcf
        File? clair_tbi = MergeAndSortClairVCFs.tbi

        File? clair_gvcf = MergeAndSortClair_gVCFs.vcf
        File? clair_gtbi = MergeAndSortClair_gVCFs.tbi

        File? dvp_g_vcf = MergeDeepVariantGVCFs.vcf
        File? dvp_g_tbi = MergeDeepVariantGVCFs.tbi
        File? dvp_vcf = MergeDeepVariantVCFs.vcf
        File? dvp_tbi = MergeDeepVariantVCFs.tbi
        File? dvp_phased_vcf = MarginPhase.phasedVCF
        File? dvp_phased_tbi = MarginPhase.phasedtbi
    }
}
diff --git a/wdl/lib/VariantCalling/CallVariantsPBCLR.wdl b/wdl/lib/VariantCalling/CallVariantsPBCLR.wdl
new file mode 100644
index 0000000..f3f3213
--- /dev/null
+++ b/wdl/lib/VariantCalling/CallVariantsPBCLR.wdl
@@ -0,0 +1,168 @@
version 1.0

##########################################################################################
# This pipeline calls SVs on an input LR BAM using various known SV algorithms
# that are specifically designed to work with long read data.
# Each individual task/algo. is directly callable, if so desired.
##########################################################################################

import "../Utility/Utils.wdl"
import "../Utility/VariantUtils.wdl"

import "PBSV.wdl"
import "Sniffles.wdl"


workflow CallVariants {
    meta {
        # key fixed from misspelled "descrition" so tooling can surface it
        description: "A workflow for calling small and/or structural variants from an aligned CLR BAM file."
+ } + input { + File bam + File bai + + File ref_fasta + File ref_fasta_fai + File ref_dict + + String prefix + + Boolean call_svs + Boolean fast_less_sensitive_sv + File? tandem_repeat_bed + + Boolean call_small_variants = false + Boolean call_small_vars_on_mitochondria + } + + parameter_meta { + bam: "input BAM from which to call SVs" + bai: "index accompanying the BAM" + + ref_fasta: "reference to which the BAM was aligned to" + ref_fasta_fai: "index accompanying the reference" + ref_dict: "sequence dictionary accompanying the reference" + + call_small_vars_on_mitochondria: "if false, will not attempt to call variants on mitochondria" + sites_vcf: "for use with Clair, the small variant caller" + sites_vcf_tbi: "for use with Clair, the small variant caller" + + prefix: "prefix for output files" + + tandem_repeat_bed: "BED file containing TRF finder (e.g. http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.trf.bed.gz)" + } + + call Utils.RandomZoneSpewer as arbitrary {input: num_of_zones = 3} + + ###################################################################### + # Block for small variants handling + ###################################################################### + # todo: use NanoCaller, Clair isn't going to support CLR + ###################################################################### + # Block for SV handling + ###################################################################### + if (call_svs) { + if (fast_less_sensitive_sv) { + + call Utils.MakeChrIntervalList { + input: + ref_dict = ref_dict, + filter = ['random', 'chrUn', 'decoy', 'alt', 'HLA', 'EBV'] + } + + scatter (c in MakeChrIntervalList.chrs) { + String contig_for_sv = c[0] + + call Utils.SubsetBam { + input: + bam = bam, + bai = bai, + locus = contig_for_sv + } + + call PBSV.RunPBSV { + input: + bam = SubsetBam.subset_bam, + bai = SubsetBam.subset_bai, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + prefix = prefix, + tandem_repeat_bed = tandem_repeat_bed, + 
is_ccs = false, + zones = arbitrary.zones + } + + call Sniffles.Sniffles { + input: + bam = SubsetBam.subset_bam, + bai = SubsetBam.subset_bai, + chr = contig_for_sv, + prefix = prefix + } + + call Utils.InferSampleName { + input: + bam = SubsetBam.subset_bam, + bai = SubsetBam.subset_bai + } + call VariantUtils.FixSnifflesVCF { + input: + vcf = Sniffles.vcf, + sample_name = InferSampleName.sample_name + } + } + + call VariantUtils.MergePerChrCalls as MergePBSVVCFs { + input: + vcfs = RunPBSV.vcf, + ref_dict = ref_dict, + prefix = prefix + ".pbsv" + } + + call VariantUtils.CollectDefinitions as UnionHeadersSnifflesVCFs { + input: + vcfs = FixSnifflesVCF.sortedVCF + } + call VariantUtils.MergeAndSortVCFs as MergeSnifflesVCFs { + input: + vcfs = FixSnifflesVCF.sortedVCF, + ref_fasta_fai = ref_fasta_fai, + header_definitions_file = UnionHeadersSnifflesVCFs.union_definitions, + prefix = prefix + ".sniffles" + } + } + + if (!fast_less_sensitive_sv) { + + call PBSV.RunPBSV as PBSVslow { + input: + bam = bam, + bai = bai, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + prefix = prefix, + tandem_repeat_bed = tandem_repeat_bed, + is_ccs = false, + zones = arbitrary.zones + } + call VariantUtils.ZipAndIndexVCF as ZipAndIndexPBSV {input: vcf = PBSVslow.vcf } + + call Sniffles.Sniffles as SnifflesSlow { + input: + bam = bam, + bai = bai, + prefix = prefix + } + call Utils.InferSampleName as infer {input: bam = bam, bai = bai} + call VariantUtils.FixSnifflesVCF as ZipAndIndexSniffles {input: vcf = SnifflesSlow.vcf, sample_name = infer.sample_name} + } + } + + output { + File? sniffles_vcf = select_first([MergeSnifflesVCFs.vcf, ZipAndIndexSniffles.sortedVCF]) + File? sniffles_tbi = select_first([MergeSnifflesVCFs.tbi, ZipAndIndexSniffles.tbi]) + + File? pbsv_vcf = select_first([MergePBSVVCFs.vcf, ZipAndIndexPBSV.vcfgz]) + File? 
pbsv_tbi = select_first([MergePBSVVCFs.tbi, ZipAndIndexPBSV.tbi]) + } +} diff --git a/wdl/lib/VariantCalling/Clair.wdl b/wdl/lib/VariantCalling/Clair.wdl new file mode 100644 index 0000000..307bed5 --- /dev/null +++ b/wdl/lib/VariantCalling/Clair.wdl @@ -0,0 +1,104 @@ +version 1.0 + +####################################################### +# This pipeline calls small variants using DeepVariant. +####################################################### + +import "../../structs/Structs.wdl" + +task Clair { + input { + File bam + File bai + + File ref_fasta + File ref_fasta_fai + + File? sites_vcf + File? sites_vcf_tbi + + String? chr + String preset + + String zones + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + bam: "input BAM from which to call variants" + bai: "index accompanying the BAM" + + ref_fasta: "reference to which the BAM was aligned to" + ref_fasta_fai: "index accompanying the reference" + + sites_vcf: "sites VCF" + sites_vcf_tbi: "sites VCF index" + + chr: "chr on which to call variants" + preset: "calling preset (CCS, ONT)" + } + + Int disk_size = 10*ceil(size(select_all([bam, bai, ref_fasta, ref_fasta_fai, sites_vcf]), "GB")) + String platform = if preset == "CCS" then "hifi" else "ont" + + command <<< + # avoid the infamous pipefail 141 https://stackoverflow.com/questions/19120263 + set -eux + SM=$(samtools view -H ~{bam} | grep -m1 '^@RG' | sed 's/\t/\n/g' | grep '^SM:' | sed 's/SM://g') + + # example from https://github.com/HKU-BAL/Clair3#option-1--docker-pre-built-image + set -euxo pipefail + + touch ~{bai} + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + # --include_all_ctgs is turned on, as scatter-gather chops bam before Clair + /opt/bin/run_clair3.sh ~{true='--vcf_fn=' false='' defined(sites_vcf)}~{select_first([sites_vcf, ""])} \ + --bam_fn=~{bam} \ + --ref_fn=~{ref_fasta} \ + --threads=${num_core} \ + --platform=~{platform} \ + --model_path="/opt/models/~{platform}" \ + --sample_name=$SM --gvcf 
~{true='--ctg_name=' false='' defined(chr)}~{select_first([chr, "--include_all_ctgs"])} \ + --output="./" + + # for chrM, Clair3 creates a header only vcf, copy it to gVCF as-is + if [[ ! -f merge_output.gvcf.gz ]]; then cp "merge_output.vcf.gz" "merge_output.gvcf.gz"; fi + >>> + + output { + File? pileup_vcf = "pileup.vcf.gz" + File? pileup_vcf_tbi = "pileup.vcf.gz.tbi" + File? full_alignment_vcf = "full_alignment.vcf.gz" + File? full_alignment_tbi = "full_alignment.vcf.gz.tbi" + + # save both VCF and gVCF + File vcf = "merge_output.vcf.gz" + File? vcf_tbi = "merge_output.vcf.gz.tbi" + File gvcf = "merge_output.gvcf.gz" + File? gvcf_tbi = "merge_output.gvcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 36, + mem_gb: 72, + disk_gb: disk_size, + boot_disk_gb: 100, + preemptible_tries: 0, + max_retries: 0, + docker: "hkubal/clair3:v0.1-r6" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + zones: zones + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/VariantCalling/GLNexus.wdl b/wdl/lib/VariantCalling/GLNexus.wdl new file mode 100644 index 0000000..036ec70 --- /dev/null +++ b/wdl/lib/VariantCalling/GLNexus.wdl @@ -0,0 +1,363 @@ +version 1.0 + +########################################################################################## +# This pipeline joint-calls GVCFs with GLNexus (https://github.com/dnanexus-rnd/GLnexus). 
+# It also permits intervals to be specified so that joint calling only takes place on a +# subset of intervals (this can be useful for finding duplicate samples). +########################################################################################## + +import "../Utility/Utils.wdl" +import "../Utility/VariantUtils.wdl" + +workflow JointCall { + input { + Array[File] gvcfs + Array[File] tbis + + File dict + File? bed + + String config = "DeepVariantWGS" + Boolean more_PL = false + Boolean squeeze = false + Boolean trim_uncalled_alleles = false + + Int? num_cpus + Int max_cpus = 64 + String prefix = "out" + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + gvcfs: "gVCF files to perform joint calling upon" + tbis: "gVCF index files" + dict: "reference sequence dictionary" + bed: "intervals to which joint calling should be restricted" + + config: "configuration preset name or .yml filename" + more_PL: "include PL from reference bands and other cases omitted by default" + squeeze: "reduce pVCF size by suppressing detail in cells derived from reference bands" + trim_uncalled_alleles: "remove alleles with no output GT calls in postprocessing" + + num_cpus: "number of CPUs to use" + max_cpus: "maximum number of CPUs to allow" + prefix: "output prefix for joined-called BCF and GVCF files" + } + + Int cpus_exp = if defined(num_cpus) then select_first([num_cpus]) else 2*length(gvcfs) + Int cpus_act = if cpus_exp < max_cpus then cpus_exp else max_cpus + + # List all of the contigs in the reference + call GetRanges { input: dict = dict, bed = bed } + + # Shard all gVCFs into per-contig shards + scatter (p in zip(gvcfs, tbis)) { + call ShardVCFByRanges { input: gvcf = p.left, tbi = p.right, ranges = GetRanges.ranges } + } + + # Joint-call in parallel over chromosomes + scatter (i in range(length(ShardVCFByRanges.sharded_gvcfs[0]))) { + Array[File] per_contig_gvcfs = transpose(ShardVCFByRanges.sharded_gvcfs)[i] + + call Call { + input: + gvcfs = 
per_contig_gvcfs, + + config = config, + more_PL = more_PL, + squeeze = squeeze, + trim_uncalled_alleles = trim_uncalled_alleles, + + num_cpus = cpus_act, + prefix = prefix + } + } + + # Concatenate the contig-sharded joint calls into a single joint callset + call ConcatBCFs { input: bcfs = Call.joint_bcf, prefix = prefix } + + output { + File joint_gvcf = ConcatBCFs.joint_gvcf + File joint_gvcf_tbi = ConcatBCFs.joint_gvcf_tbi + } +} + +task GetRanges { + meta { + description: "Select loci over which to parallelize downstream operations." + } + + input { + File dict + File? bed + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + ceil(size(dict, "GB")) + + command <<< + set -euxo pipefail + + if [[ "~{defined(bed)}" == "true" ]]; then + cat ~{bed} | awk '{ print $1 ":" $2 "-" $3 }' > ranges.txt + else + grep '^@SQ' ~{dict} | \ + awk '{ print $2, $3 }' | \ + sed 's/[SL]N://g' | \ + awk '{ print $1 ":0-" $2 }' \ + > ranges.txt + fi + >>> + + output { + Array[String] ranges = read_lines("ranges.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "ghcr.io/dnanexus-rnd/glnexus:v1.4.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ShardVCFByRanges { + meta { + description: "Split VCF into 
smaller ranges for parallelization." + } + + input { + File gvcf + File tbi + Array[String] ranges + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size(gvcf, "GB")) + + command <<< + set -euxo pipefail + + mkdir per_contig + + INDEX=0 + for RANGE in ~{sep=' ' ranges} + do + PINDEX=$(printf "%06d" $INDEX) + FRANGE=$(echo $RANGE | sed 's/[:-]/___/g') + OUTFILE="per_contig/$PINDEX.~{basename(gvcf, ".g.vcf.gz")}.locus_$FRANGE.g.vcf.gz" + + bcftools view ~{gvcf} $RANGE | bgzip > $OUTFILE + + INDEX=$(($INDEX+1)) + done + >>> + + output { + Array[File] sharded_gvcfs = glob("per_contig/*") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "ghcr.io/dnanexus-rnd/glnexus:v1.4.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Call { + meta { + description: "Joint-call gVCFs with GLNexus." + } + + input { + Array[File] gvcfs + + String config = "DeepVariantWGS" + Boolean more_PL = false + Boolean squeeze = false + Boolean trim_uncalled_alleles = false + + Int num_cpus = 96 + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 5*ceil(size(gvcfs, "GB")) + Int mem = 4*num_cpus + + command <<< + set -x + + # For guidance on performance settings, see https://github.com/dnanexus-rnd/GLnexus/wiki/Performance + ulimit -Sn 65536 + + echo ~{gvcfs[0]} | sed 's/.*locus_//' | sed 's/.g.vcf.bgz//' | sed 's/___/\t/g' > range.bed + + glnexus_cli \ + --config ~{config} \ + --bed range.bed \ + ~{if more_PL then "--more-PL" else ""} \ + ~{if squeeze then "--squeeze" else ""} \ + ~{if trim_uncalled_alleles then "--trim-uncalled-alleles" else ""} \ + --list ~{write_lines(gvcfs)} \ + > ~{prefix}.bcf + >>> + + output { + File joint_bcf = "~{prefix}.bcf" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_cpus, + mem_gb: mem, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "ghcr.io/dnanexus-rnd/glnexus:v1.4.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task CompressAndIndex { + meta { + description: "Convert a BCF file to a vcf.bgz file and index it." + } + + input { + File joint_bcf + + Int num_cpus = 8 + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 3*ceil(size(joint_bcf, "GB")) + + command <<< + set -x + + bcftools view ~{joint_bcf} | bgzip -@ ~{num_cpus} -c > ~{prefix}.g.vcf.bgz + tabix -p vcf ~{prefix}.g.vcf.bgz + >>> + + output { + File joint_gvcf = "~{prefix}.g.vcf.bgz" + File joint_gvcf_tbi = "~{prefix}.g.vcf.bgz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_cpus, + mem_gb: 4*num_cpus, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "ghcr.io/dnanexus-rnd/glnexus:v1.4.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ConcatBCFs { + meta { + description: "Concatenate BCFs into a single .vcf.bgz file and index it." + } + + input { + Array[File] bcfs + + Int num_cpus = 4 + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size(bcfs, "GB")) + + command <<< + set -euxo pipefail + + bcftools concat -n ~{sep=' ' bcfs} | bcftools view | bgzip -@ ~{num_cpus} -c > ~{prefix}.g.vcf.bgz + tabix -p vcf ~{prefix}.g.vcf.bgz + >>> + + output { + File joint_gvcf = "~{prefix}.g.vcf.bgz" + File joint_gvcf_tbi = "~{prefix}.g.vcf.bgz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_cpus, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "ghcr.io/dnanexus-rnd/glnexus:v1.4.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/VariantCalling/ONTPepper.wdl b/wdl/lib/VariantCalling/ONTPepper.wdl new file mode 100644 index 0000000..57a0f04 --- /dev/null +++ b/wdl/lib/VariantCalling/ONTPepper.wdl @@ -0,0 +1,123 @@ +version 1.0 + +####################################################### +# This pipeline calls small variants using DeepVariant. +####################################################### + +import "../../structs/Structs.wdl" + +task Pepper { + + meta { + description: "A 1-stop shop task offered by Pepper for ONT data." + } + + input { + File bam + File bai + + File ref_fasta + File ref_fasta_fai + + Int threads + Int memory + + String zones = "us-central1-b us-central1-c" + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + # when running large scale workflows, we sometimes see errors like the following + # A resource limit has delayed the operation: generic::resource_exhausted: allocating: selecting resources: selecting region and zone: + # no available zones: 2763 LOCAL_SSD_TOTAL_GB (738/30000 available) usage too high + zones: "select which zone (GCP) to run this task" + } + + Int bam_sz = ceil(size(bam, "GB")) + Boolean is_big_bam = bam_sz > 100 + Int inflation_factor = if (is_big_bam) then 10 else 5 + Int minimal_disk = 1000 + Int disk_size = if inflation_factor * bam_sz > minimal_disk then inflation_factor * bam_sz else minimal_disk + + String output_root = "/cromwell_root/pepper_output" + + String prefix = basename(bam, ".bam") + ".deepvariant_pepper" + + command <<< + # avoid the infamous pipefail 141 https://stackoverflow.com/questions/19120263 + set -eux + SM=$(samtools view -H ~{bam} | grep -m1 '^@RG' | sed 's/\t/\n/g' | grep '^SM:' | sed 's/SM://g') + + set -euxo pipefail + + touch ~{bai} + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + mkdir -p "~{output_root}" + + run_pepper_margin_deepvariant \ + call_variant \ + -b ~{bam} \ + -f ~{ref_fasta} \ + -t "${num_core}" \ + -s "${SM}" \ + -o "~{output_root}" \ + -p "~{prefix}" \ + --gvcf \ + --phased_output \ + --ont + + df -h . 
+ find "~{output_root}/" -print | sed -e 's;[^/]*/;|____;g;s;____|; |;g' \ + > "~{output_root}/dir_structure.txt" + + if [[ -f "~{output_root}/intermediate_files/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" ]]; then + mv "~{output_root}/intermediate_files/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" \ + "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" + mv "~{output_root}/intermediate_files/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam.bai" \ + "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam.bai" + fi + >>> + + output { + File VCF = "~{output_root}/~{prefix}.vcf.gz" + File VCF_tbi = "~{output_root}/~{prefix}.vcf.gz.tbi" + + File gVCF = "~{output_root}/~{prefix}.g.vcf.gz" + File gVCF_tbi = "~{output_root}/~{prefix}.g.vcf.gz.tbi" + + File phasedVCF = "~{output_root}/~{prefix}.phased.vcf.gz" + File phasedtbi = "~{output_root}/~{prefix}.phased.vcf.gz.tbi" + + File hap_tagged_bam = "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" + File hap_tagged_bai = "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam.bai" + + # maybe less useful + File output_dir_structure = "~{output_root}/dir_structure.txt" + File phaseset_bed = "~{output_root}/~{prefix}.phaseset.bed" + File visual_report_html = "~{output_root}/~{prefix}.visual_report.html" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: threads, + mem_gb: memory, + disk_gb: disk_size, + boot_disk_gb: 100, + preemptible_tries: 1, + max_retries: 1, + docker: "kishwars/pepper_deepvariant:r0.4.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + zones: zones + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + 
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/lib/VariantCalling/PBSV.wdl b/wdl/lib/VariantCalling/PBSV.wdl new file mode 100644 index 0000000..8120ca9 --- /dev/null +++ b/wdl/lib/VariantCalling/PBSV.wdl @@ -0,0 +1,177 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +workflow RunPBSV { + input { + File bam + File bai + Boolean is_ccs + + File ref_fasta + File ref_fasta_fai + String prefix + + String zones + + File? tandem_repeat_bed + } + + parameter_meta { + is_ccs: "if input BAM is CCS reads" + } + + call Discover { + input: + bam = bam, + bai = bai, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + tandem_repeat_bed = tandem_repeat_bed, + prefix = prefix, + zones = zones + } + + call Call { + input: + svsigs = [ Discover.svsig ], + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + ccs = is_ccs, + prefix = prefix, + zones = zones + } + + output { + File vcf = Call.vcf + } +} + +task Discover { + input { + File bam + File bai + File ref_fasta + File ref_fasta_fai + File? tandem_repeat_bed + String? chr + String prefix + String zones + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + bam: "input BAM from which to call SVs" + bai: "index accompanying the BAM" + ref_fasta: "reference to which the BAM was aligned to" + ref_fasta_fai: "index accompanying the reference" + tandem_repeat_bed: "BED file containing TRF finder (e.g. 
http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.trf.bed.gz)" + chr: "chr on which to call variants" + prefix: "prefix for output" + } + + Int MINIMAL_DISK = 500 + Boolean is_big_bam = size(bam, "GB") > 100 + Int inflation_factor = if (is_big_bam) then 5 else 2 + Int disk_size = inflation_factor * (ceil(size([bam, bai, ref_fasta, ref_fasta_fai], "GB")) + 1) + Int runtime_disk_size = if disk_size < MINIMAL_DISK then MINIMAL_DISK else disk_size + + String fileoutput = if defined(chr) then "~{prefix}.~{chr}.svsig.gz" else "~{prefix}.svsig.gz" + + command <<< + set -euxo pipefail + + pbsv discover \ + ~{if defined(tandem_repeat_bed) then "--tandem-repeats ~{tandem_repeat_bed}" else ""} \ + ~{bam} \ + ~{fileoutput} + >>> + + output { + File svsig = "~{fileoutput}" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: if(defined(chr)) then 8 else 32, + mem_gb: if(defined(chr)) then 32 else 128, + disk_gb: runtime_disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-sv:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + zones: zones + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Call { + input { + Array[File] svsigs + File ref_fasta + File ref_fasta_fai + Boolean ccs + String prefix + String zones + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + svsigs: "per-chromosome *.svsig.gz files" + ref_fasta: "reference to which the BAM was aligned to" + ref_fasta_fai: "index accompanying the reference" + ccs: "use optimizations for CCS data" + prefix: "prefix for output" + } + + Int disk_size = 2*ceil(size(svsigs, "GiB") + size([ref_fasta, ref_fasta_fai], "GiB")) + + command <<< + set -euxo pipefail + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + pbsv call -j $num_core --log-level INFO ~{true='--ccs' false='' ccs} \ + ~{ref_fasta} \ + ~{sep=' ' svsigs} \ + ~{prefix}.pbsv.pre.vcf + + cat ~{prefix}.pbsv.pre.vcf | grep -v -e '##fileDate' > ~{prefix}.pbsv.vcf + >>> + + output { + File vcf = "~{prefix}.pbsv.vcf" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 96, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-sv:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + zones: zones + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + diff --git a/wdl/lib/VariantCalling/Sniffles.wdl b/wdl/lib/VariantCalling/Sniffles.wdl new file mode 100644 index 0000000..a98a02d --- /dev/null +++ b/wdl/lib/VariantCalling/Sniffles.wdl @@ -0,0 +1,77 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +# Given BAM, call SVs using Sniffles +task Sniffles { + input { + File 
bam + File bai + Int min_read_support = 2 + Int min_read_length = 1000 + Int min_mq = 20 + String? chr + String prefix + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + bam: "input BAM from which to call SVs" + bai: "index accompanying the BAM" + min_read_support: "[default-valued] minimum reads required to make a call" + min_read_length: "[default-valued] filter out reads below minimum read length" + min_mq: "[default-valued] minimum mapping quality to accept" + chr: "chr on which to call variants" + prefix: "prefix for output" + } + + Int cpus = 8 + Int disk_size = 2*ceil(size([bam, bai], "GB")) + String fileoutput = if defined(chr) then "~{prefix}.~{chr}.sniffles.vcf" else "~{prefix}.sniffles.vcf" + + command <<< + set -x + + sniffles -t ~{cpus} \ + -m ~{bam} \ + -v ~{fileoutput}\ + -s ~{min_read_support} \ + -r ~{min_read_length} \ + -q ~{min_mq} \ + --num_reads_report -1 \ + --genotype + + touch ~{prefix}.~{fileoutput} + + cat ~{prefix}.~{fileoutput}| \ + grep -v -e '##fileDate' | \ + awk '{ if ($1 ~ "^#" || $7 == "PASS") print $0 }' \ + > ~{prefix}.~{fileoutput} + >>> + + output { + File vcf = "~{fileoutput}" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: cpus, + mem_gb: 46, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-sv:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: 
select_first([runtime_attr.docker, default_attr.docker]) + } +} + diff --git a/wdl/lib/VariantCalling/Sniffles2.wdl b/wdl/lib/VariantCalling/Sniffles2.wdl new file mode 100644 index 0000000..734c94f --- /dev/null +++ b/wdl/lib/VariantCalling/Sniffles2.wdl @@ -0,0 +1,177 @@ +version 1.0 + + +import "../../structs/Structs.wdl" + + +workflow Sniffles2 { + + meta { + description: "This workflow calls SV candidates using Sniffles2 population mode." + } + + input { + Array[File] sampleBAMs + Array[File] sampleBAIs + Array[String] sampleIDs + String prefix + Int minsvlen = 50 + } + + parameter_meta { + # input + sampleBAMs: "GCS paths to aligned BAM files from multiple samples" + sampleBAIs: "GCS paths to aligned BAM files indices from multiple samples" + sampleIDs: "matching sample IDs of the BAMs" + minsvlen: "Minimum SV length in bp" + prefix: "prefix for output files" + # output + single_snf: ".snf output containing SV candidates from a single sample" + multisample_vcf: "Multi-sample vcf output" + } + + scatter (i in range(length(sampleBAMs))) { + call SampleSV { + input: + bam = sampleBAMs[i], + bai = sampleBAIs[i], + minsvlen = minsvlen, + prefix = prefix, + sample_id = sampleIDs[i] + } + } + + call MergeCall { + input: + snfs = SampleSV.snf, + prefix = prefix + } + + + output { + Array[File] single_snf = SampleSV.snf + File multisample_vcf = MergeCall.vcf + } +} + + + +task SampleSV { + + meta { + description: "This task calls SV candidates from a single sample." + } + + input { + File bam + File bai + Int minsvlen = 50 + String sample_id + String prefix + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + bam: "input BAM from which to call SVs" + bai: "index accompanying the BAM" + minsvlen: "minimum SV length in bp. 
Default 50" + sample_id: "Sample ID" + prefix: "prefix for output" + } + + Int cpus = 8 + Int disk_size = 2*ceil(size([bam, bai], "GB")) + String snf_output = "~{prefix}.sniffles.snf" + String vcf_output = "~{prefix}.sniffles.vcf" + + command <<< + set -eux + + sniffles -t ~{cpus} \ + -i ~{bam} \ + --minsvlen ~{minsvlen} \ + --sample-id ~{sample_id} \ + --vcf ~{vcf_output} \ + --snf ~{snf_output} + tree + >>> + + output { + File snf = "~{snf_output}" + File vcf = "~{vcf_output}" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: cpus, + mem_gb: 46, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-sniffles2:2.0.6" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + + +task MergeCall { + + meta { + description: "This tasks performs joined-calling from multiple .snf files and produces a single .vcf" + } + + input { + Array[File] snfs + String prefix + RuntimeAttr? 
runtime_attr_override
+    }
+
+    parameter_meta {
+        snfs: ".snf files"  # per-sample Sniffles2 candidates, joint-called into one VCF named after `prefix`
+    }
+
+    command <<<
+        set -eux
+        sniffles --input ~{sep=" " snfs} \
+            --vcf ~{prefix}.vcf  # was hard-coded `multisample.vcf`, which never matched the declared `File vcf` output below
+        tree
+    >>>
+
+    output {
+        File vcf = "~{prefix}.vcf"
+    }
+
+    Int cpus = 8
+    Int disk_size = 3*ceil(size(snfs, "GB"))
+    #########################
+    RuntimeAttr default_attr = object {
+        cpu_cores: cpus,
+        mem_gb: 46,
+        disk_gb: disk_size,
+        boot_disk_gb: 10,
+        preemptible_tries: 3,
+        max_retries: 2,
+        docker: "us.gcr.io/broad-dsp-lrma/lr-sniffles2:2.0.6"
+    }
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: select_first([runtime_attr.docker, default_attr.docker])
+    }
+
+}
\ No newline at end of file
diff --git a/wdl/lib/Visualization/NanoPlot.wdl b/wdl/lib/Visualization/NanoPlot.wdl
new file mode 100644
index 0000000..f486b1f
--- /dev/null
+++ b/wdl/lib/Visualization/NanoPlot.wdl
@@ -0,0 +1,334 @@
+version 1.0
+
+import "../../structs/Structs.wdl"
+
+task NanoPlotFromSummary {
+    input {
+        Array[File] summary_files
+
+        RuntimeAttr?
runtime_attr_override
+    }
+
+    Int disk_size = 2*ceil(size(summary_files, "GB"))
+
+    command <<<
+        set -euxo pipefail
+
+        num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l)
+
+        NanoPlot -t ${num_core} \
+            -c orangered \
+            --N50 \
+            --tsv_stats \
+            --summary ~{sep=' ' summary_files}  # unquoted: quoting the joined array passed all files as one bogus space-containing path
+
+        cat NanoStats.txt | \
+            grep -v -e '^Metrics' -e '^highest' -e '^longest' | \
+            sed 's/ >/_/' | \
+            sed 's/://' | \
+            awk '{ print $1 "\t" $2 }' | \
+            tee map.txt
+    >>>
+
+    #number_of_reads 88000
+    #number_of_bases 467855516.0
+    #median_read_length 4086.0
+    #mean_read_length 5316.5
+    #read_length_stdev 4413.2
+    #n50 6731.0
+    #active_channels 506
+    #mean_qual 12.8
+    #median_qual 13.7
+    #Reads_Q5 85483
+    #Reads_Q7 80249
+    #Reads_Q10 71810
+    #Reads_Q12 59097
+    #Reads_Q15 26597
+
+    output {
+        File stats = "NanoStats.txt"
+        Map[String, Float] stats_map = read_map("map.txt")
+
+        Array[File] plots = glob("*.png")
+        File ActivePores_Over_Time = "ActivePores_Over_Time.png"
+        File ActivityMap_ReadsPerChannel = "ActivityMap_ReadsPerChannel.png"
+        File CumulativeYieldPlot_Gigabases = "CumulativeYieldPlot_Gigabases.png"
+        File CumulativeYieldPlot_NumberOfReads = "CumulativeYieldPlot_NumberOfReads.png"
+        File LengthvsQualityScatterPlot_dot = "LengthvsQualityScatterPlot_dot.png"
+        File LengthvsQualityScatterPlot_kde = "LengthvsQualityScatterPlot_kde.png"
+        File Non_weightedHistogramReadlength = "Non_weightedHistogramReadlength.png"
+        File Non_weightedLogTransformed_HistogramReadlength = "Non_weightedLogTransformed_HistogramReadlength.png"
+        File NumberOfReads_Over_Time = "NumberOfReads_Over_Time.png"
+        File TimeLengthViolinPlot = "TimeLengthViolinPlot.png"
+        File TimeQualityViolinPlot = "TimeQualityViolinPlot.png"
+        File WeightedHistogramReadlength = "WeightedHistogramReadlength.png"
+        File WeightedLogTransformed_HistogramReadlength = "WeightedLogTransformed_HistogramReadlength.png"
+        File Yield_By_Length = "Yield_By_Length.png"
+    }
+
+    #########################
+    RuntimeAttr
default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "quay.io/biocontainers/nanoplot:1.35.5--pyhdfd78af_0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task NanoPlotFromRichFastqs { + input { + Array[File] fastqs + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 2*ceil(size(fastqs, "GB")) + + command <<< + set -euxo pipefail + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + NanoPlot -t ${num_core} \ + -c orangered \ + --N50 \ + --tsv_stats \ + --fastq_rich ~{sep=' ' fastqs} + + cat NanoStats.txt | \ + grep -v -e '^Metrics' -e '^highest' -e '^longest' | \ + sed 's/ >/_/' | \ + sed 's/://' | \ + awk '{ print $1 "\t" $2 }' | \ + tee map.txt + >>> + + output { + File stats = "NanoStats.txt" + Map[String, Float] stats_map = read_map("map.txt") + + Array[File] plots = glob("*.png") + File ActivePores_Over_Time = "ActivePores_Over_Time.png" + File ActivityMap_ReadsPerChannel = "ActivityMap_ReadsPerChannel.png" + File CumulativeYieldPlot_Gigabases = "CumulativeYieldPlot_Gigabases.png" + File CumulativeYieldPlot_NumberOfReads = "CumulativeYieldPlot_NumberOfReads.png" + File LengthvsQualityScatterPlot_dot = "LengthvsQualityScatterPlot_dot.png" + File LengthvsQualityScatterPlot_kde = "LengthvsQualityScatterPlot_kde.png" + 
File Non_weightedHistogramReadlength = "Non_weightedHistogramReadlength.png" + File Non_weightedLogTransformed_HistogramReadlength = "Non_weightedLogTransformed_HistogramReadlength.png" + File NumberOfReads_Over_Time = "NumberOfReads_Over_Time.png" + File TimeLengthViolinPlot = "TimeLengthViolinPlot.png" + File TimeQualityViolinPlot = "TimeQualityViolinPlot.png" + File WeightedHistogramReadlength = "WeightedHistogramReadlength.png" + File WeightedLogTransformed_HistogramReadlength = "WeightedLogTransformed_HistogramReadlength.png" + File Yield_By_Length = "Yield_By_Length.png" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "quay.io/biocontainers/nanoplot:1.35.5--pyhdfd78af_0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task NanoPlotFromBam { + input { + File bam + File bai + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 2*ceil(size(bam, "GB")) + 10 + + command <<< + set -euxo pipefail + + touch ~{bai} # avoid the warning bai is older than bam + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + NanoPlot -t ${num_core} \ + -c orangered \ + --N50 \ + --tsv_stats \ + --no_supplementary \ + --verbose \ + --bam "~{bam}" + + cat NanoStats.txt | \ + grep -v -e '^Metrics' -e '^highest' -e '^longest' | \ + sed 's/ >/_/' | \ + sed 's/://' | \ + awk '{ print $1 "\t" $2 }' | \ + tee map.txt + >>> + + #number_of_reads 143488 + #number_of_bases 993469297.0 + #number_of_bases_aligned 402067275.0 + #fraction_bases_aligned 0.4 + #median_read_length 5081.0 + #mean_read_length 6923.7 + #read_length_stdev 6116.7 + #n50 9210.0 + #average_identity 92.8 + #median_identity 94.5 + #mean_qual 14.6 + #median_qual 15.0 + #Reads_Q5 143488 + #Reads_Q7 143488 + #Reads_Q10 140551 + #Reads_Q12 119386 + #Reads_Q15 71164 + + output { + File stats = "NanoStats.txt" + Map[String, Float] stats_map = read_map("map.txt") + + Array[File] plots = glob("*.png") +# File AlignedReadlengthvsSequencedReadLength_dot = "AlignedReadlengthvsSequencedReadLength_dot.png" +# File AlignedReadlengthvsSequencedReadLength_kde = "AlignedReadlengthvsSequencedReadLength_kde.png" +# File LengthvsQualityScatterPlot_dot = "LengthvsQualityScatterPlot_dot.png" +# File LengthvsQualityScatterPlot_kde = "LengthvsQualityScatterPlot_kde.png" +# File MappingQualityvsAverageBaseQuality_dot = "MappingQualityvsAverageBaseQuality_dot.png" +# File MappingQualityvsAverageBaseQuality_kde = "MappingQualityvsAverageBaseQuality_kde.png" +# File MappingQualityvsReadLength_dot = "MappingQualityvsReadLength_dot.png" +# File MappingQualityvsReadLength_kde = "MappingQualityvsReadLength_kde.png" +# File Non_weightedHistogramReadlength = "Non_weightedHistogramReadlength.png" +# File Non_weightedLogTransformed_HistogramReadlength = "Non_weightedLogTransformed_HistogramReadlength.png" +# File 
PercentIdentityHistogramDynamic_Histogram_percent_identity = "PercentIdentityHistogramDynamic_Histogram_percent_identity.png" +# File PercentIdentityvsAlignedReadLength_dot = "PercentIdentityvsAlignedReadLength_dot.png" +# File PercentIdentityvsAlignedReadLength_kde = "PercentIdentityvsAlignedReadLength_kde.png" +# File PercentIdentityvsAverageBaseQuality_dot = "PercentIdentityvsAverageBaseQuality_dot.png" +# File PercentIdentityvsAverageBaseQuality_kde = "PercentIdentityvsAverageBaseQuality_kde.png" +# File WeightedHistogramReadlength = "WeightedHistogramReadlength.png" +# File WeightedLogTransformed_HistogramReadlength = "WeightedLogTransformed_HistogramReadlength.png" +# File Yield_By_Length = "Yield_By_Length.png" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 8, + mem_gb: 24, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-nanoplot:1.40.0-1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task NanoPlotFromUBam { + input { + File bam + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 2*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + NanoPlot -t ${num_core} \ + -c orangered \ + --N50 \ + --tsv_stats \ + --ubam "~{bam}" + + cat NanoStats.txt | \ + grep -v -e '^Metrics' -e '^highest' -e '^longest' | \ + sed 's/ >/_/' | \ + sed 's/://' | \ + awk '{ print $1 "\t" $2 }' | \ + tee map.txt + >>> + + #number_of_reads 991 + #number_of_bases 12949457.0 + #median_read_length 13705.0 + #mean_read_length 13067.1 + #read_length_stdev 9581.3 + #n50 18618.0 + #mean_qual 0.0 + #median_qual 0.0 + #Reads_Q5 0 + #Reads_Q7 0 + #Reads_Q10 0 + #Reads_Q12 0 + #Reads_Q15 0 + + output { + File stats = "NanoStats.txt" + Map[String, Float] stats_map = read_map("map.txt") + + Array[File] plots = glob("*.png") + File Non_weightedHistogramReadlength = "Non_weightedHistogramReadlength.png" + File Non_weightedLogTransformed_HistogramReadlength = "Non_weightedLogTransformed_HistogramReadlength.png" + File WeightedHistogramReadlength = "WeightedHistogramReadlength.png" + File WeightedLogTransformed_HistogramReadlength = "WeightedLogTransformed_HistogramReadlength.png" + File Yield_By_Length = "Yield_By_Length.png" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "quay.io/biocontainers/nanoplot:1.35.5--pyhdfd78af_0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, 
default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/pipelines/HelloWorkflow.wdl b/wdl/pipelines/HelloWorkflow.wdl new file mode 100644 index 0000000..c271344 --- /dev/null +++ b/wdl/pipelines/HelloWorkflow.wdl @@ -0,0 +1,29 @@ +version 1.0 + +import "../tasks/HelloTask.wdl" as Hello +import "../lib/Utility/Utils.wdl" + +workflow HelloWorkflow { + meta { + description: "Example workflow." + } + + parameter_meta { + greeting: "The message to print" + } + + input { + String greeting + } + + # Run a task locally defined in this repo + call Hello.Print { input: message = greeting } + + # Run a task remotely defined in the long-read-pipelines repo + call Utils.Sum { input: ints = [1, 2, 3] } + + output { + String message = Print.text + Int sum = Sum.sum + } +} diff --git a/wdl/structs/Structs.wdl b/wdl/structs/Structs.wdl new file mode 100644 index 0000000..eef96b9 --- /dev/null +++ b/wdl/structs/Structs.wdl @@ -0,0 +1,16 @@ +version 1.0 + +struct RuntimeAttr { + Float? mem_gb + Int? cpu_cores + Int? disk_gb + Int? boot_disk_gb + Int? preemptible_tries + Int? max_retries + String? docker +} + +struct DataTypeParameters { + Int num_shards + String map_preset +} diff --git a/wdl/tasks/HelloTask.wdl b/wdl/tasks/HelloTask.wdl new file mode 100644 index 0000000..f25c863 --- /dev/null +++ b/wdl/tasks/HelloTask.wdl @@ -0,0 +1,25 @@ +version 1.0 + +task Print { + input { + String message + } + + command <<< + echo ~{message} + >>> + + output { + String text = read_string(stdout()) + } + + runtime { + cpu: 1 + memory: 10 + " GiB" + disks: "local-disk " + 10 + " HDD" + bootDiskSizeGb: 10 + preemptible: 2 + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-align:0.1.28" + } +} \ No newline at end of file