From 75966059a88b3aa3d5c43f8976df0e3a42d38b92 Mon Sep 17 00:00:00 2001 From: Foivos Gypas Date: Sun, 22 Dec 2024 22:01:13 +0100 Subject: [PATCH] feature: add dockerfile --- .github/workflows/ci.yml | 35 ++++++++++- Dockerfile | 37 ++++++++++++ docs/guides/usage.md | 58 ++++++++++++++++++- tests/input_files/config_docker.yaml | 18 ++++++ tests/input_files/samples_docker.tsv | 3 + .../test.local.sh | 45 ++++++++++++++ 6 files changed, 194 insertions(+), 2 deletions(-) create mode 100644 Dockerfile create mode 100644 tests/input_files/config_docker.yaml create mode 100644 tests/input_files/samples_docker.tsv create mode 100755 tests/test_integration_workflow_with_docker/test.local.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e1b9c19..ad4b1e4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -214,4 +214,37 @@ jobs: run: bash tests/test_htsinfer_with_conda/test.local.sh - name: Run SRA downloads workflow - run: bash tests/test_sra_download_with_conda/test.local.sh \ No newline at end of file + run: bash tests/test_sra_download_with_conda/test.local.sh + + integration-docker: + needs: + - snakemake-graphs-format + runs-on: ubuntu-20.04 + defaults: + run: + shell: bash -l {0} + steps: + + - name: Checkout zarp repository + uses: actions/checkout@v4 + + - name: Setup miniconda & zarp env + uses: conda-incubator/setup-miniconda@v3 + with: + python-version: "3.10" + mamba-version: "1" + channels: conda-forge + channel-priority: true + auto-update-conda: false + activate-environment: zarp + environment-file: install/environment.yml + auto-activate-base: false + + - name: Update zarp env with dev. 
packages + run: mamba env update -p $CONDA_PREFIX -f install/environment.dev.yml + + - name: Run test script + run: bash tests/test_integration_workflow_with_docker/test.local.sh + + - name: Clean up + run: rm -rf data% \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..33bd647 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,37 @@ +FROM continuumio/miniconda3:24.7.1-0 + + +COPY install/environment.yml /environment.yml +COPY workflow /workflow +COPY resources /resources +COPY tests/input_files/config.yaml /config.yaml +COPY tests/input_files/samples.tsv /samples.tsv +COPY tests/input_files/rule_config.yaml /rule_config.yaml +COPY tests/input_files/project1/synthetic.mate_1.fastq.gz /project1/synthetic.mate_1.fastq.gz +COPY tests/input_files/project1/synthetic.mate_2.fastq.gz /project1/synthetic.mate_2.fastq.gz +COPY tests/input_files/project2/synthetic.mate_1.fastq.gz /project2/synthetic.mate_1.fastq.gz +COPY tests/input_files/homo_sapiens/annotation.gtf /annotation.gtf +COPY tests/input_files/homo_sapiens/genome.fa /genome.fa + +RUN sed -i 's# - conda-forge##' workflow/envs/STAR.yaml && \ + sed -i 's#2.7.11#2.7.10#' workflow/envs/STAR.yaml && \ + sed -i 's#../input_files/project1/#/project1/#g' /samples.tsv && \ + sed -i 's#../input_files/project2/#/project2/#g' /samples.tsv && \ + sed -i 's#../input_files/homo_sapiens/##g' /samples.tsv && \ + sed -i 's#../input_files/##' /config.yaml + +RUN conda install -c conda-forge mamba --yes && \ + mamba env create -f /environment.yml && \ + conda clean --all --yes + +RUN echo "source activate zarp" > ~/.bashrc + +ENV SNAKEMAKE_CONDA_PREFIX="/conda_envs" +ENV PATH=/opt/conda/envs/zarp/bin:$PATH + +RUN snakemake -p --snakefile /workflow/Snakefile --configfile /config.yaml --cores 4 --use-conda --conda-create-envs-only --verbose && \ + conda clean --all --yes + +RUN rm /config.yaml /samples.tsv /rule_config.yaml /project1/synthetic.mate_1.fastq.gz /project1/synthetic.mate_2.fastq.gz 
/project2/synthetic.mate_1.fastq.gz + +RUN mkdir -p /data \ No newline at end of file diff --git a/docs/guides/usage.md b/docs/guides/usage.md index 1a89832..cc46f45 100644 --- a/docs/guides/usage.md +++ b/docs/guides/usage.md @@ -162,4 +162,60 @@ snakemake \ However, this call will exit with an error, as not all parameters can be inferred from the example files. The argument `--keep-incomplete` makes sure the `samples_htsinfer.tsv` file can nevertheless be inspected. -After successful execution - if all parameters could be either inferred or were specified by the user - `[OUTDIR]/[SAMPLES_OUT]` should contain a populated table with parameters `seqmode`, `f1_3p`, `f2_3p`, `organism`, `libtype` and `index_size`. \ No newline at end of file +After successful execution - if all parameters could be either inferred or were specified by the user - `[OUTDIR]/[SAMPLES_OUT]` should contain a populated table with parameters `seqmode`, `f1_3p`, `f2_3p`, `organism`, `libtype` and `index_size`. + + +## How to use Docker? + +ZARP is optimised for Linux users as all packages are available via Conda or Apptainer (Singularity). On other systems such as macOS, these packages don't work, especially due to the current transition from Intel to ARM processors (M series). Nevertheless, we built a Docker container that can be used to run ZARP in such environments. + +1. Install Docker following the instructions [here](https://docs.docker.com/desktop/install/mac-install/) + +2. Pull the Docker image that contains the necessary dependencies +```sh +docker pull zavolab/zarp:1.0.0-rc.1 +``` + +3. Create a directory (e.g. `data`) and store all the files required for a run: + - The genome sequence fasta file + - The annotation gtf file + - The fastq files of your experiments + - The `rule_config.yaml` for the parameters + - The `samples.tsv` containing the metadata of your samples + - The `config.yaml` file with parameters.
Below is an example file; note that it points to files in the `data` directory. + ```yaml + --- + # Required fields + samples: "data/samples_docker.tsv" + output_dir: "data/results" + log_dir: "data/logs" + cluster_log_dir: "data/logs/cluster" + kallisto_indexes: "data/results/kallisto_indexes" + salmon_indexes: "data/results/salmon_indexes" + star_indexes: "data/results/star_indexes" + alfa_indexes: "data/results/alfa_indexes" + # Optional fields + rule_config: "data/rule_config.yaml" + report_description: "No description provided by user" + report_logo: "../../images/logo.128px.png" + report_url: "https://zavolan.biozentrum.unibas.ch/" + author_name: "NA" + author_email: "NA" + ... + ``` + +4. Execute ZARP as follows: + ```sh + docker run \ + --platform linux/x86_64 \ + --mount type=bind,source=$PWD/data,target=/data \ + zavolab/zarp:1.0.0-rc.1 \ + snakemake \ + -p \ + --snakefile /workflow/Snakefile \ + --configfile data/config.yaml \ + --cores 4 \ + --use-conda \ + --verbose + ``` + The command runs the Docker container `zavolab/zarp:1.0.0-rc.1` that we have pulled. The option `--platform linux/x86_64` makes it run as it would on a Linux x86_64 platform. We use the `--mount` option to bind the local `data` directory that contains the input files to the `/data` directory in the container. The pipeline is stored in the container at the path `/workflow/Snakefile`. Once ZARP has completed, the results will be stored in the `data/results` directory.
\ No newline at end of file diff --git a/tests/input_files/config_docker.yaml b/tests/input_files/config_docker.yaml new file mode 100644 index 0000000..d43470d --- /dev/null +++ b/tests/input_files/config_docker.yaml @@ -0,0 +1,18 @@ +--- + # Required fields + samples: "data/samples_docker.tsv" + output_dir: "data/results" + log_dir: "data/logs" + cluster_log_dir: "data/logs/cluster" + kallisto_indexes: "data/results/kallisto_indexes" + salmon_indexes: "data/results/salmon_indexes" + star_indexes: "data/results/star_indexes" + alfa_indexes: "data/results/alfa_indexes" + # Optional fields + rule_config: "data/rule_config.yaml" + report_description: "No description provided by user" + report_logo: "../../images/logo.128px.png" + report_url: "https://zavolan.biozentrum.unibas.ch/" + author_name: "NA" + author_email: "NA" +... diff --git a/tests/input_files/samples_docker.tsv b/tests/input_files/samples_docker.tsv new file mode 100644 index 0000000..f1c1ee7 --- /dev/null +++ b/tests/input_files/samples_docker.tsv @@ -0,0 +1,3 @@ +sample seqmode fq1 index_size kmer fq1_3p fq1_5p organism gtf genome sd mean libtype fq1_polya_3p fq1_polya_5p fq2 fq2_3p fq2_5p fq2_polya_3p fq2_polya_5p +synthetic_10_reads_paired_synthetic_10_reads_paired pe data/project1/synthetic.mate_1.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXX homo_sapiens data/annotation.gtf data/genome.fa 100 250 ISF AAAAAAAAAAAAAAAAA XXXXXXXXXXXXXXXXX data/project1/synthetic.mate_2.fastq.gz AGATCGGAAGAGCGT XXXXXXXXXXXXX XXXXXXXXXXXXXXXXX TTTTTTTTTTTTTTTTT +synthetic_10_reads_mate_1_synthetic_10_reads_mate_1 se data/project2/synthetic.mate_1.fastq.gz 75 31 AGATCGGAAGAGCACA XXXXXXXXXXXXX homo_sapiens data/annotation.gtf data/genome.fa 100 250 SF AAAAAAAAAAAAAAAAA XXXXXXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX diff --git a/tests/test_integration_workflow_with_docker/test.local.sh b/tests/test_integration_workflow_with_docker/test.local.sh new file mode 100755 index 
0000000..f9cadb9 --- /dev/null +++ b/tests/test_integration_workflow_with_docker/test.local.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# Tear down test environment +cleanup () { + rc=$? + rm -rf .cache/ + rm -rf .config/ + rm -rf .fontconfig/ + rm -rf .java/ + rm -rf .snakemake/ + cd $user_dir + echo "Exit status: $rc" +} +trap cleanup EXIT + +# Set up test environment +set -eo pipefail # ensures that script exits at first command that exits with non-zero status +set -u # ensures that script exits when unset variables are used +set -x # facilitates debugging by printing out executed commands +user_dir=$PWD +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +cd $script_dir + +mkdir -p data +cp ../../tests/input_files/homo_sapiens/genome.fa data/genome.fa +cp ../../tests/input_files/homo_sapiens/annotation.gtf data/annotation.gtf +cp -r ../../tests/input_files/project1 data/project1 +cp -r ../../tests/input_files/project2 data/project2 +cp -r ../../tests/input_files/config_docker.yaml data/config_docker.yaml +cp ../../tests/input_files/rule_config.yaml data/rule_config.yaml +cp ../../tests/input_files/samples_docker.tsv data/samples_docker.tsv + +# Pull the zarp container +docker pull zavolab/zarp:1.0.0-rc.1 + +# Run tests with Docker +docker run \ + --platform linux/x86_64 \ + --mount type=bind,source=$script_dir/data,target=/data \ + zavolab/zarp:1.0.0-rc.1 \ + snakemake \ + -p \ + --snakefile /workflow/Snakefile \ + --configfile data/config_docker.yaml \ + --cores 4 --use-conda --verbose \ No newline at end of file