feature: add dockerfile

zavolanlab · Dec 22, 2024 · 7596605 · 7596605
1 parent 53ed475
commit 7596605
Show file tree

Hide file tree

Showing 6 changed files with 194 additions and 2 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -214,4 +214,37 @@ jobs:
         run: bash tests/test_htsinfer_with_conda/test.local.sh
 
       - name: Run SRA downloads workflow
-        run: bash tests/test_sra_download_with_conda/test.local.sh
+        run: bash tests/test_sra_download_with_conda/test.local.sh
+
+  # Integration test: runs the workflow inside the published ZARP Docker image
+  integration-docker:
+    needs:
+      - snakemake-graphs-format
+    runs-on: ubuntu-20.04
+    defaults:
+      run:
+        shell: bash -l {0}  # login shell, needed for the conda environment set up by setup-miniconda
+    steps:
+        - name: Checkout zarp repository
+          uses: actions/checkout@v4
+
+        - name: Setup miniconda & zarp env
+          uses: conda-incubator/setup-miniconda@v3
+          with:
+            python-version: "3.10"
+            mamba-version: "1"
+            channels: conda-forge
+            channel-priority: true
+            auto-update-conda: false
+            activate-environment: zarp
+            environment-file: install/environment.yml
+            auto-activate-base: false
+
+        - name: Update zarp env with dev. packages
+          run: mamba env update -p $CONDA_PREFIX -f install/environment.dev.yml
+
+        - name: Run test script
+          run: bash tests/test_integration_workflow_with_docker/test.local.sh
+
+        - name: Clean up
+          run: sudo rm -rf tests/test_integration_workflow_with_docker/data  # test data is staged next to the script; sudo because container output may be root-owned
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,56 @@
+FROM continuumio/miniconda3:24.7.1-0
+
+# Some of the test paths rewritten by sed below are relative, so the build
+# relies on `/` being the working directory; state that explicitly.
+WORKDIR /
+
+# Conda environment definition, workflow code and resources
+COPY install/environment.yml /environment.yml
+COPY workflow /workflow
+COPY resources /resources
+
+# Test inputs, used by the Snakemake environment pre-build step below
+COPY tests/input_files/config.yaml /config.yaml
+COPY tests/input_files/samples.tsv /samples.tsv
+COPY tests/input_files/rule_config.yaml /rule_config.yaml
+COPY tests/input_files/project1/synthetic.mate_1.fastq.gz /project1/synthetic.mate_1.fastq.gz
+COPY tests/input_files/project1/synthetic.mate_2.fastq.gz /project1/synthetic.mate_2.fastq.gz
+COPY tests/input_files/project2/synthetic.mate_1.fastq.gz /project2/synthetic.mate_1.fastq.gz
+COPY tests/input_files/homo_sapiens/annotation.gtf /annotation.gtf
+COPY tests/input_files/homo_sapiens/genome.fa /genome.fa
+
+# Patch the STAR environment (drop the `conda-forge` channel entry, replace
+# 2.7.11 by 2.7.10) and point the test sample table and config at the
+# locations the inputs were copied to above. Dots are escaped so that they
+# only match literal dots.
+RUN sed -i -e 's#  - conda-forge##' -e 's#2\.7\.11#2.7.10#' /workflow/envs/STAR.yaml && \
+  sed -i \
+    -e 's#\.\./input_files/project1/#/project1/#g' \
+    -e 's#\.\./input_files/project2/#/project2/#g' \
+    -e 's#\.\./input_files/homo_sapiens/##g' \
+    /samples.tsv && \
+  sed -i 's#\.\./input_files/##' /config.yaml
+
+# Install mamba, create the environment described by /environment.yml and drop package caches in the same layer
+RUN conda install -c conda-forge mamba --yes && \
+  mamba env create -f /environment.yml && \
+  conda clean --all --yes
+
+# Activate the `zarp` environment from ~/.bashrc (this replaces the file's previous content)
+RUN echo "source activate zarp" > ~/.bashrc
+
+# Put the `zarp` environment's binaries on PATH and set Snakemake's conda prefix to /conda_envs
+ENV SNAKEMAKE_CONDA_PREFIX="/conda_envs" \
+    PATH=/opt/conda/envs/zarp/bin:$PATH
+
+# Pre-build the rule environments at image build time (`--conda-create-envs-only`)
+RUN snakemake -p --snakefile /workflow/Snakefile --configfile /config.yaml --cores 4 --use-conda --conda-create-envs-only --verbose && \
+  conda clean --all --yes
+
+# Remove the test inputs that are no longer needed and create the /data mount point.
+# NOTE: deleting in a later layer hides the files but does not shrink the image.
+RUN rm /config.yaml /samples.tsv /rule_config.yaml \
+    /project1/synthetic.mate_1.fastq.gz \
+    /project1/synthetic.mate_2.fastq.gz \
+    /project2/synthetic.mate_1.fastq.gz && \
+  mkdir -p /data
diff --git a/docs/guides/usage.md b/docs/guides/usage.md
@@ -162,4 +162,60 @@ snakemake \
 
 However, this call will exit with an error, as not all parameters can be inferred from the example files. The argument `--keep-incomplete` makes sure the `samples_htsinfer.tsv` file can nevertheless be inspected. 
 
-After successful execution - if all parameters could be either inferred or were specified by the user - `[OUTDIR]/[SAMPLES_OUT]` should contain a populated table with parameters `seqmode`, `f1_3p`, `f2_3p`, `organism`, `libtype` and `index_size`.
+After successful execution - if all parameters could be either inferred or were specified by the user - `[OUTDIR]/[SAMPLES_OUT]` should contain a populated table with parameters `seqmode`, `f1_3p`, `f2_3p`, `organism`, `libtype` and `index_size`.
+
+
+## How to use Docker?
+
+ZARP is optimised for Linux, where all required packages are available via Conda or Apptainer (Singularity). On other systems such as macOS, they do not work, mainly because of the ongoing transition from Intel to ARM processors (Apple M series). We therefore provide a Docker image that can be used to run ZARP in such environments.
+
+1. Install Docker following the instructions [here](https://docs.docker.com/desktop/install/mac-install/)
+
+2. Pull the Docker image that contains the necessary dependencies:
+```sh
+docker pull zavolab/zarp:1.0.0-rc.1
+```
+
+3. Create a directory (e.g. `data`) and store all the files required for a run:
+    - The genome sequence fasta file
+    - The annotation gtf file
+    - The fastq files of your experiments
+    - The `rule_config.yaml` for the parameters
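+    - The `samples.tsv` containing the metadata of your samples (the file name must match the `samples` entry of `config.yaml`; the example below expects `data/samples_docker.tsv`)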
+    - The `config.yaml` file with parameters. Below you can find an example file where you can see that it points to files in the `data` directory.
+        ```yaml
+        ---
+          # Required fields
+          samples: "data/samples_docker.tsv"
+          output_dir: "data/results"
+          log_dir: "data/logs"
+          cluster_log_dir: "data/logs/cluster"
+          kallisto_indexes: "data/results/kallisto_indexes"
+          salmon_indexes: "data/results/salmon_indexes"
+          star_indexes: "data/results/star_indexes"
+          alfa_indexes: "data/results/alfa_indexes"
+          # Optional fields
+          rule_config: "data/rule_config.yaml"
+          report_description: "No description provided by user"
+          report_logo: "../../images/logo.128px.png"
+          report_url: "https://zavolan.biozentrum.unibas.ch/"
+          author_name: "NA"
+          author_email: "NA"
+        ...
+        ```
+
+4. Execute ZARP as follows:
+    ```sh
+    docker run \
+        --platform linux/x86_64 \
+        --mount type=bind,source=$PWD/data,target=/data \
+        zavolab/zarp:1.0.0-rc.1 \
+        snakemake \
+        -p \
+        --snakefile /workflow/Snakefile \
+        --configfile data/config.yaml \
+        --cores 4 \
+        --use-conda \
+        --verbose
+    ```
+    The command starts a container from the image `zavolab/zarp:1.0.0-rc.1` that was pulled in step 2. The option `--platform linux/x86_64` runs it as it would be run on a Linux x86_64 machine. The `--mount` option binds the local `data` directory, which contains the input files, to the `/data` directory in the container. The pipeline itself is stored in the container at `/workflow/Snakefile`. Once ZARP has finished, the results can be found in the local `data/results` directory.
diff --git a/tests/input_files/config_docker.yaml b/tests/input_files/config_docker.yaml
@@ -0,0 +1,18 @@
+---
+  # Required fields. All `data/...` paths are relative; the Docker integration test bind-mounts its staged inputs at /data and passes this file to Snakemake as `data/config_docker.yaml`.
+  samples: "data/samples_docker.tsv"
+  output_dir: "data/results"
+  log_dir: "data/logs"
+  cluster_log_dir: "data/logs/cluster"
+  kallisto_indexes: "data/results/kallisto_indexes"
+  salmon_indexes: "data/results/salmon_indexes"
+  star_indexes: "data/results/star_indexes"
+  alfa_indexes: "data/results/alfa_indexes"
+  # Optional fields (rule parameters and report metadata)
+  rule_config: "data/rule_config.yaml"
+  report_description: "No description provided by user"
+  report_logo: "../../images/logo.128px.png"  # NOTE(review): relative path; confirm it resolves inside the container
+  report_url: "https://zavolan.biozentrum.unibas.ch/"
+  author_name: "NA"
+  author_email: "NA"
+...
diff --git a/tests/input_files/samples_docker.tsv b/tests/input_files/samples_docker.tsv
@@ -0,0 +1,3 @@
+sample	seqmode	fq1	index_size	kmer	fq1_3p	fq1_5p	organism	gtf	genome	sd	mean	libtype	fq1_polya_3p	fq1_polya_5p	fq2	fq2_3p	fq2_5p	fq2_polya_3p	fq2_polya_5p
+synthetic_10_reads_paired_synthetic_10_reads_paired	pe	data/project1/synthetic.mate_1.fastq.gz	75	31	AGATCGGAAGAGCACA	XXXXXXXXXXXXX	homo_sapiens	data/annotation.gtf	data/genome.fa	100	250	ISF	AAAAAAAAAAAAAAAAA	XXXXXXXXXXXXXXXXX	data/project1/synthetic.mate_2.fastq.gz	AGATCGGAAGAGCGT	XXXXXXXXXXXXX	XXXXXXXXXXXXXXXXX	TTTTTTTTTTTTTTTTT
+synthetic_10_reads_mate_1_synthetic_10_reads_mate_1	se	data/project2/synthetic.mate_1.fastq.gz	75	31	AGATCGGAAGAGCACA	XXXXXXXXXXXXX	homo_sapiens	data/annotation.gtf	data/genome.fa	100	250	SF	AAAAAAAAAAAAAAAAA	XXXXXXXXXXXXXXXXX	XXXXXXXXXXXXX	XXXXXXXXXXXXX	XXXXXXXXXXXXX	XXXXXXXXXXXXX	XXXXXXXXXXXXX
diff --git a/tests/test_integration_workflow_with_docker/test.local.sh b/tests/test_integration_workflow_with_docker/test.local.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Tear down test environment: remove tool caches from the current directory, return to the caller's directory and report the exit status
+cleanup () {
+    rc=$?
+    rm -rf .cache/
+    rm -rf .config/
+    rm -rf .fontconfig/
+    rm -rf .java/
+    rm -rf .snakemake/
+    cd "$user_dir"
+    echo "Exit status: $rc"
+}
+trap cleanup EXIT
+
+# Set up test environment (paths are quoted so that directories containing spaces work)
+set -eo pipefail  # ensures that script exits at first command that exits with non-zero status
+set -u  # ensures that script exits when unset variables are used
+set -x  # facilitates debugging by printing out executed commands
+user_dir=$PWD
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+cd "$script_dir"
+
+mkdir -p data  # staging directory that is bind-mounted into the container as /data
+cp ../../tests/input_files/homo_sapiens/genome.fa data/genome.fa
+cp ../../tests/input_files/homo_sapiens/annotation.gtf data/annotation.gtf
+cp -r ../../tests/input_files/project1 data/project1
+cp -r ../../tests/input_files/project2 data/project2
+cp ../../tests/input_files/config_docker.yaml data/config_docker.yaml
+cp ../../tests/input_files/rule_config.yaml data/rule_config.yaml
+cp ../../tests/input_files/samples_docker.tsv data/samples_docker.tsv
+
+# Pull the zarp container
+docker pull zavolab/zarp:1.0.0-rc.1
+
+# Run tests with Docker; `data/...` paths given to Snakemake refer to the bind-mounted /data directory
+docker run \
+    --platform linux/x86_64 \
+    --mount "type=bind,source=$script_dir/data,target=/data" \
+    zavolab/zarp:1.0.0-rc.1 \
+    snakemake \
+    -p \
+    --snakefile /workflow/Snakefile \
+    --configfile data/config_docker.yaml \
+    --cores 4 --use-conda --verbose