From 75966059a88b3aa3d5c43f8976df0e3a42d38b92 Mon Sep 17 00:00:00 2001
From: Foivos Gypas <fgypas@gmail.com>
Date: Sun, 22 Dec 2024 22:01:13 +0100
Subject: [PATCH] feature: add dockerfile

---
 .github/workflows/ci.yml                      | 35 ++++++++++-
 Dockerfile                                    | 37 ++++++++++++
 docs/guides/usage.md                          | 58 ++++++++++++++++++-
 tests/input_files/config_docker.yaml          | 18 ++++++
 tests/input_files/samples_docker.tsv          |  3 +
 .../test.local.sh                             | 45 ++++++++++++++
 6 files changed, 194 insertions(+), 2 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 tests/input_files/config_docker.yaml
 create mode 100644 tests/input_files/samples_docker.tsv
 create mode 100755 tests/test_integration_workflow_with_docker/test.local.sh

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e1b9c19..ad4b1e4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -214,4 +214,42 @@ jobs:
         run: bash tests/test_htsinfer_with_conda/test.local.sh
 
       - name: Run SRA downloads workflow
-        run: bash tests/test_sra_download_with_conda/test.local.sh
\ No newline at end of file
+        run: bash tests/test_sra_download_with_conda/test.local.sh
+
+  # Runs the workflow inside the published zarp Docker image
+  integration-docker:
+    needs:
+      - snakemake-graphs-format
+    runs-on: ubuntu-20.04
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+
+      - name: Checkout zarp repository
+        uses: actions/checkout@v4
+
+      - name: Setup miniconda & zarp env
+        uses: conda-incubator/setup-miniconda@v3
+        with:
+          python-version: "3.10"
+          mamba-version: "1"
+          channels: conda-forge
+          channel-priority: true
+          auto-update-conda: false
+          activate-environment: zarp
+          environment-file: install/environment.yml
+          auto-activate-base: false
+
+      - name: Update zarp env with dev. packages
+        run: mamba env update -p $CONDA_PREFIX -f install/environment.dev.yml
+
+      - name: Run test script
+        run: bash tests/test_integration_workflow_with_docker/test.local.sh
+
+      # test.local.sh stages its inputs in a `data` directory next to itself and
+      # the container writes its results there through the bind mount, so that
+      # is the directory to remove. sudo: the container may have written those
+      # files as root (NOTE(review): confirm against the image's user).
+      - name: Clean up
+        run: sudo rm -rf tests/test_integration_workflow_with_docker/data
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..33bd647
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,58 @@
+# ZARP image: workflow sources, the `zarp` Conda environment and the pre-built
+# per-rule Conda environments.
+FROM continuumio/miniconda3:24.7.1-0
+
+# The build below (and the documented `docker run` calls) use paths relative to
+# the filesystem root, so pin the working directory instead of relying on the
+# base image's default.
+WORKDIR /
+
+# Workflow sources plus the bundled test inputs. NOTE(review): the test inputs
+# appear to be needed only for the environment pre-build below; they are
+# removed again at the end.
+COPY install/environment.yml /environment.yml
+COPY workflow /workflow
+COPY resources /resources
+COPY tests/input_files/config.yaml /config.yaml
+COPY tests/input_files/samples.tsv /samples.tsv
+COPY tests/input_files/rule_config.yaml /rule_config.yaml
+COPY tests/input_files/project1/synthetic.mate_1.fastq.gz /project1/synthetic.mate_1.fastq.gz
+COPY tests/input_files/project1/synthetic.mate_2.fastq.gz /project1/synthetic.mate_2.fastq.gz
+COPY tests/input_files/project2/synthetic.mate_1.fastq.gz /project2/synthetic.mate_1.fastq.gz
+COPY tests/input_files/homo_sapiens/annotation.gtf /annotation.gtf
+COPY tests/input_files/homo_sapiens/genome.fa /genome.fa
+
+# Patch the STAR environment file (blank out its conda-forge channel entry,
+# 2.7.11 -> 2.7.10) and strip the ../input_files/ prefixes from the test sample
+# table and config so they match the locations used by the COPY lines above.
+RUN sed -i 's#  - conda-forge##' /workflow/envs/STAR.yaml && \
+  sed -i 's#2\.7\.11#2.7.10#' /workflow/envs/STAR.yaml && \
+  sed -i 's#\.\./input_files/project1/#/project1/#g' /samples.tsv && \
+  sed -i 's#\.\./input_files/project2/#/project2/#g' /samples.tsv && \
+  sed -i 's#\.\./input_files/homo_sapiens/##g' /samples.tsv && \
+  sed -i 's#\.\./input_files/##' /config.yaml
+
+# Install mamba and create the environment described by /environment.yml.
+RUN conda install -c conda-forge mamba --yes && \
+  mamba env create -f /environment.yml && \
+  conda clean --all --yes
+
+# NOTE(review): '>' overwrites any ~/.bashrc shipped by the base image; confirm
+# that is intended.
+RUN echo "source activate zarp" > ~/.bashrc
+
+# SNAKEMAKE_CONDA_PREFIX: presumably where Snakemake keeps the per-rule
+# environments. PATH: exposes the zarp environment without shell activation.
+ENV SNAKEMAKE_CONDA_PREFIX="/conda_envs"
+ENV PATH=/opt/conda/envs/zarp/bin:$PATH
+
+# Create the per-rule Conda environments only (--conda-create-envs-only).
+RUN snakemake -p --snakefile /workflow/Snakefile --configfile /config.yaml --cores 4 --use-conda --conda-create-envs-only --verbose && \
+  conda clean --all --yes
+
+# Remove the build-only test inputs, including the reference files and the
+# emptied project directories.
+RUN rm -r /config.yaml /samples.tsv /rule_config.yaml /project1 /project2 /annotation.gtf /genome.fa
+
+# Mount point for the user's data directory.
+RUN mkdir -p /data
diff --git a/docs/guides/usage.md b/docs/guides/usage.md
index 1a89832..cc46f45 100644
--- a/docs/guides/usage.md
+++ b/docs/guides/usage.md
@@ -162,4 +162,60 @@ snakemake \
 
 However, this call will exit with an error, as not all parameters can be inferred from the example files. The argument `--keep-incomplete` makes sure the `samples_htsinfer.tsv` file can nevertheless be inspected. 
 
-After successful execution - if all parameters could be either inferred or were specified by the user - `[OUTDIR]/[SAMPLES_OUT]` should contain a populated table with parameters `seqmode`, `f1_3p`, `f2_3p`, `organism`, `libtype` and `index_size`.
\ No newline at end of file
+After successful execution - if all parameters could be either inferred or were specified by the user - `[OUTDIR]/[SAMPLES_OUT]` should contain a populated table with parameters `seqmode`, `f1_3p`, `f2_3p`, `organism`, `libtype` and `index_size`.
+
+
+## How to use Docker?
+
+ZARP is optimised for Linux, where all required packages are available via Conda or Apptainer (Singularity). On other systems such as macOS, these packages may not work, in particular because of the ongoing transition from Intel to ARM processors (M series). For such environments, we provide a Docker image that can be used to run ZARP.
+
+1. Install Docker following the instructions [here](https://docs.docker.com/desktop/install/mac-install/)
+
+2. Pull the Docker image that contains the necessary dependencies:
+```sh
+docker pull zavolab/zarp:1.0.0-rc.1
+```
+
+3. Create a directory (e.g. `data`) and store all the files required for a run:
+    - The genome sequence fasta file
+    - The annotation gtf file
+    - The fastq files of your experiments
+    - The `rule_config.yaml` for the parameters
+    - The `samples.tsv` containing the metadata of your samples
+    - The `config.yaml` file with parameters. Below you can find an example file where you can see that it points to files in the `data` directory.
+        ```yaml
+        ---
+          # Required fields
+          samples: "data/samples.tsv"
+          output_dir: "data/results"
+          log_dir: "data/logs"
+          cluster_log_dir: "data/logs/cluster"
+          kallisto_indexes: "data/results/kallisto_indexes"
+          salmon_indexes: "data/results/salmon_indexes"
+          star_indexes: "data/results/star_indexes"
+          alfa_indexes: "data/results/alfa_indexes"
+          # Optional fields
+          rule_config: "data/rule_config.yaml"
+          report_description: "No description provided by user"
+          report_logo: "../../images/logo.128px.png"
+          report_url: "https://zavolan.biozentrum.unibas.ch/"
+          author_name: "NA"
+          author_email: "NA"
+        ...
+        ```
+
+4. Execute ZARP as follows:
+    ```sh
+    docker run \
+        --platform linux/x86_64 \
+        --mount type=bind,source=$PWD/data,target=/data \
+        zavolab/zarp:1.0.0-rc.1 \
+        snakemake \
+        -p \
+        --snakefile /workflow/Snakefile \
+        --configfile data/config.yaml \
+        --cores 4 \
+        --use-conda \
+        --verbose
+    ```
+    The command runs the Docker image `zavolab/zarp:1.0.0-rc.1` that we pulled above. The `--platform linux/x86_64` option makes Docker run it as it would on an x86_64 Linux machine. The `--mount` option binds the local `data` directory, which contains the input files, to the `/data` directory in the container. The pipeline is stored in the container at `/workflow/Snakefile`. Once ZARP has finished, the results will be in the `data/results` directory.
\ No newline at end of file
diff --git a/tests/input_files/config_docker.yaml b/tests/input_files/config_docker.yaml
new file mode 100644
index 0000000..d43470d
--- /dev/null
+++ b/tests/input_files/config_docker.yaml
@@ -0,0 +1,18 @@
+---
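+  # Required fields; "data/..." is the directory the Docker test bind-mounts at /data (assumes the container's working directory is /)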
+  samples: "data/samples_docker.tsv"
+  output_dir: "data/results"
+  log_dir: "data/logs"
+  cluster_log_dir: "data/logs/cluster"
+  kallisto_indexes: "data/results/kallisto_indexes"
+  salmon_indexes: "data/results/salmon_indexes"
+  star_indexes: "data/results/star_indexes"
+  alfa_indexes: "data/results/alfa_indexes"
+  # Optional fields
+  rule_config: "data/rule_config.yaml"
+  report_description: "No description provided by user"
+  report_logo: "../../images/logo.128px.png"
+  report_url: "https://zavolan.biozentrum.unibas.ch/"
+  author_name: "NA"
+  author_email: "NA"
+...
diff --git a/tests/input_files/samples_docker.tsv b/tests/input_files/samples_docker.tsv
new file mode 100644
index 0000000..f1c1ee7
--- /dev/null
+++ b/tests/input_files/samples_docker.tsv
@@ -0,0 +1,3 @@
+sample	seqmode	fq1	index_size	kmer	fq1_3p	fq1_5p	organism	gtf	genome	sd	mean	libtype	fq1_polya_3p	fq1_polya_5p	fq2	fq2_3p	fq2_5p	fq2_polya_3p	fq2_polya_5p
+synthetic_10_reads_paired_synthetic_10_reads_paired	pe	data/project1/synthetic.mate_1.fastq.gz	75	31	AGATCGGAAGAGCACA	XXXXXXXXXXXXX	homo_sapiens	data/annotation.gtf	data/genome.fa	100	250	ISF	AAAAAAAAAAAAAAAAA	XXXXXXXXXXXXXXXXX	data/project1/synthetic.mate_2.fastq.gz	AGATCGGAAGAGCGT	XXXXXXXXXXXXX	XXXXXXXXXXXXXXXXX	TTTTTTTTTTTTTTTTT
+synthetic_10_reads_mate_1_synthetic_10_reads_mate_1	se	data/project2/synthetic.mate_1.fastq.gz	75	31	AGATCGGAAGAGCACA	XXXXXXXXXXXXX	homo_sapiens	data/annotation.gtf	data/genome.fa	100	250	SF	AAAAAAAAAAAAAAAAA	XXXXXXXXXXXXXXXXX	XXXXXXXXXXXXX	XXXXXXXXXXXXX	XXXXXXXXXXXXX	XXXXXXXXXXXXX	XXXXXXXXXXXXX
diff --git a/tests/test_integration_workflow_with_docker/test.local.sh b/tests/test_integration_workflow_with_docker/test.local.sh
new file mode 100755
index 0000000..f9cadb9
--- /dev/null
+++ b/tests/test_integration_workflow_with_docker/test.local.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Integration test: runs the ZARP workflow on the bundled test data inside the
+# published Docker image, with inputs staged in ./data next to this script.
+
+# Tear down test environment
+cleanup () {
+    rc=$?
+    rm -rf .cache/
+    rm -rf .config/
+    rm -rf .fontconfig/
+    rm -rf .java/
+    rm -rf .snakemake/
+    cd "$user_dir"
+    echo "Exit status: $rc"
+}
+trap cleanup EXIT
+
+# Set up test environment
+set -eo pipefail  # ensures that script exits at first command that exits with non-zero status
+set -u  # ensures that script exits when unset variables are used
+set -x  # facilitates debugging by printing out executed commands
+user_dir=$PWD
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+cd "$script_dir"
+
+# Docker image under test
+image="zavolab/zarp:1.0.0-rc.1"
+
+# Stage the inputs in the directory that is bind-mounted into the container
+mkdir -p data
+cp ../../tests/input_files/homo_sapiens/genome.fa data/genome.fa
+cp ../../tests/input_files/homo_sapiens/annotation.gtf data/annotation.gtf
+# Copy the project directories *into* data/ so that a rerun refreshes them
+# instead of nesting a second copy inside the existing directories
+cp -r ../../tests/input_files/project1 ../../tests/input_files/project2 data/
+cp ../../tests/input_files/config_docker.yaml data/config_docker.yaml
+cp ../../tests/input_files/rule_config.yaml data/rule_config.yaml
+cp ../../tests/input_files/samples_docker.tsv data/samples_docker.tsv
+
+# Pull the zarp container
+docker pull "$image"
+
+# Run tests with Docker
+docker run \
+    --platform linux/x86_64 \
+    --mount "type=bind,source=$script_dir/data,target=/data" \
+    "$image" \
+    snakemake \
+    -p \
+    --snakefile /workflow/Snakefile \
+    --configfile data/config_docker.yaml \
+    --cores 4 --use-conda --verbose