Merge branch 'master' of github.com:grunwaldlab/pathogensurveillance

nf-core · Jul 27, 2024 · 8d39d84 · 8d39d84
2 parents b1998ad + 1716a07
commit 8d39d84
Show file tree

Hide file tree

Showing 23 changed files with 112 additions and 75 deletions.
diff --git a/assets/main_report/_quarto.yml b/assets/main_report/_quarto.yml
@@ -45,13 +45,13 @@ format:
       body-width: 1000px
       margin-width: 0px
       gutter-width: 1.5rem
-  pdf:
-    toc: true
-    documentclass: scrreprt
-    prefer-html: true
-    margin-left: 1.5cm
-    margin-right: 1.5cm
-    margin-top: 2cm
-    margin-bottom: 2cm
+  # pdf:
+  #   toc: true
+  #   documentclass: scrreprt
+  #   prefer-html: true
+  #   margin-left: 1.5cm
+  #   margin-right: 1.5cm
+  #   margin-top: 2cm
+  #   margin-bottom: 2cm
 
 bibliography: references.bib
diff --git a/assets/main_report/index.qmd b/assets/main_report/index.qmd
@@ -12,10 +12,16 @@ nocite: |
 
 ```{r knitr_settings}
 knitr::opts_chunk$set(echo = FALSE, fig.width = 8, warning = FALSE, message = FALSE)
+# "/home/fosterz/data/files/projects/current/pathogensurveillance/work/89/fd5ae1c12d9de756a9475d1320cf50"
+# "_test_data/mixed_bacteria/n_meningitidis_inputs"
+# "_test_data/mixed_bacteria/m_abscessus_inputs/"
+# "_test_data/xanthomonas/xan_test_inputs"
+# "_test_data/mycobacteroides/_no_group_defined__inputs"
+# "_test_data/chaos/all_inputs"
 ```
 
 
-```{r load_libraries}
+```{r load_libraries, include=FALSE}
 library(dplyr)
 library(ggplot2)
 library(readr)
@@ -42,8 +48,10 @@ library(webshot2)
 library(ggdendro)
 library(rcrossref)
 library(devtools)
-# install.packages(c('phylocanvas', 'poppr', 'metacoder'), repos='http://cran.us.r-project.org')
-# devtools::install_github("grunwaldlab/psminer")
+if (!require(phylocanvas)) install.packages('phylocanvas', repos='http://cran.us.r-project.org')
+if (!require(poppr)) install.packages('poppr', repos='http://cran.us.r-project.org')
+if (!require(metacoder)) install.packages('metacoder', repos='http://cran.us.r-project.org')
+if (!require(psminer)) devtools::install_github("grunwaldlab/psminer")
 library(phylocanvas)
 library(poppr)
 library(metacoder)
@@ -94,7 +102,7 @@ The number and severity of issues experienced by the pipeline.
 ```
 
 ```{r, results='asis', include=no_messages}
-cat('✅  **No issues were reported.**')
+cat('✅  **No issues reported.**')
 ```
 
 
@@ -108,7 +116,7 @@ status_message_table(params$inputs)
 ::: {.callout-tip collapse="true"}
 ## About this table
 
-A list of issues reported by the pipeline during execution. When relevant, the sample IDs or reference IDs associated with the isssue are incuded.
+A list of issues reported by the pipeline during execution. When relevant, the sample IDs or reference IDs associated with the issue are included.
 :::
 
 :::
@@ -127,7 +135,7 @@ sample_meta_table(params$inputs)
 
 The following data provides tentative classifications of the samples based on exact matches of a subset of short DNA sequences.
 These are intended to be preliminary identifications.
-For more robust identifications based on whole genome sequences, see the results of the core genome phylogeny below.
+For more robust identifications based on whole genome sequences, see the "Phylogenetic context" section below.
 
 ::: panel-tabset
 ### Taxonomic classification summary
@@ -215,7 +223,7 @@ Note that this measure only takes into account the shared portion of genomes, so
 
 ### ANI heatmap
 
-```{r ani_heatmap, fig.height = 10}
+```{r ani_heatmap}
 make_ani_heatmap(ani_matrix * 100, reference_data, sample_data)
 ```
 
@@ -248,7 +256,7 @@ Currently, POCP is only calculated for Prokaryotes.
 
 ### POCP heatmap
 
-```{r pocp_heatmap, fig.height = 10}
+```{r pocp_heatmap}
 make_ani_heatmap(pocp_matrix, reference_data, sample_data)
 ```
 
@@ -311,8 +319,12 @@ Could not generate core genome phylogeny. No tree files found.
 
 ## SNP trees
 
-```{r results='asis'}
+```{r}
 snp_plots <- variant_tree_plot(params$inputs, interactive = FALSE)
+```
+
+
+```{r, eval = length(snp_plots) > 0, results='asis'}
 plot_one <- function(x) {
     print(snp_plots[[x]])
 }
@@ -336,8 +348,12 @@ Question-does it make sense to be showing the reference within the tree?
 ')
 ```
 
-```{r div_no_snp_phylo, eval = length(snp_plots) <= 0}
-cat('There is no tree to draw. See staus messages at the start of the report for details.')
+```{r results='asis', eval = length(snp_plots) <= 0}
+cat('
+There is no tree to draw.
+This can happen when there are too few samples or too few SNPs.
+See status messages at the start of the report for details.
+')
 ```
 
 
@@ -348,6 +364,7 @@ threshold_options <- c(0.0001, 0.001, 0.01, 0.1)
 align_data <- variant_align_path_data(params$inputs)
 alignments <- variant_align_parsed(params$inputs)
 alignments <- alignments[!unlist(lapply(alignments, is.null))]
+alignments <- alignments[unlist(lapply(alignments, nrow)) > 3]
 align_data <- align_data[align_data$path %in% names(alignments), ]
 sample_data <- sample_meta_parsed(params$inputs)
 ref_data <- ref_meta_parsed(params$inputs)
@@ -364,21 +381,21 @@ ids_used <- unique(unlist(lapply(alignments, function(a) {
 })))
 color_by_cols <- unique(unlist(strsplit(sample_data$color_by[sample_data$sample_id %in% ids_used], split = ';')))
 color_by_cols <- color_by_cols[! is.na(color_by_cols)]
-color_by_col_names <- c(color_by_cols, 'Default')
-color_by_cols <- c(as.list(color_by_cols), list(NULL))  # NULL ensures that the default color scheme is also used
 
-plot_one <- function(ref_id, threshold, color_by) {
+plot_one <- function(ref_id, threshold, color_by = NULL) {
   align <- alignments[[align_data$path[align_data$ref_id == ref_id]]]
   align_without_ref <- align[rownames(align) != ref_id, ]
   psminer:::make_MSN(align_without_ref, sample_data, user_seed = 1, snp_diff_prop = threshold, population = color_by)
 }
 
 plot_vars <- list(
   Reference = align_data$ref_id,
-  Threshold = threshold_options,
-  'Color By' = unlist(color_by_cols)
+  Threshold = threshold_options
 )
-print_figures_with_selector(plot_one, selector = plot_vars, id_prefix = 'snp-msn', width = 1500, height = 1500, res = 200)
+if (length(color_by_cols) > 0) {
+    plot_vars$Color = unlist(color_by_cols)
+}
+print_figures_with_selector(plot_one, selector = plot_vars, id_prefix = 'snp-msn', width = 1500, height = 1200, res = 175)
 ```
 
 
@@ -398,9 +415,11 @@ Note: within these MSNs, edge lengths are not proportional to SNP differences.
  ')
 ```
 
-```{asis div_no_snp_align, echo = length(alignments) <= 0}
+```{r, results='asis', eval = length(alignments) <= 0}
 cat('
-Could not generate minimum spanning networks. This is likely due to not having any FASTA SNP alignment inputs.
+Could not generate minimum spanning networks.
+This can happen when there are too few samples or too few SNPs.
+See status messages at the start of the report for details.
 ')
 ```
 
@@ -411,12 +430,9 @@ Could not generate minimum spanning networks. This is likely due to not having a
 
 The `pathogen surveillance` pipeline used the following tools that should be referenced as appropriate:
 
--   A sample is first identified to genus using sendsketch and further identified to species using sourmash [@brown2016sourmash].
--   The `nextflow` data-driven computational pipeline enables deployment of complex parallel and reactive workflows [@di2017nextflow].
-
-## {{< bi sliders >}} Input settings
-
-Add settings used to run Nextflow and the pipeline parameters.
+- A sample is first identified to genus using sendsketch and further identified to species using sourmash [@brown2016sourmash].
+- The `nextflow` data-driven computational pipeline enables deployment of complex parallel and reactive workflows [@di2017nextflow].
+- *To be continued...*
 
 ## {{< bi gear-wide-connected >}} Analysis software
 
@@ -430,7 +446,7 @@ version_data$citation <- reflist[match(version_data$program, reflist$program), '
 knitr::kable(version_data)
 ```
 
-## version and packages
+## R packages used
 
 ```{r, include=FALSE}
 # automatically create a bib database for R packages

diff --git a/assets/main_report/packages.bib b/assets/main_report/packages.bib
@@ -68,7 +68,7 @@ @Manual{R-ggnewscale
   title = {ggnewscale: Multiple Fill and Colour Scales in ggplot2},
   author = {Elio Campitelli},
   year = {2024},
-  note = {R package version 0.4.10},
+  note = {R package version 0.5.0},
   url = {https://eliocamp.github.io/ggnewscale/},
 }
 

diff --git a/assets/main_report/styles.css b/assets/main_report/styles.css
@@ -39,9 +39,14 @@ h1.title {
 }
 
 .zoomist-wrapper img {
-    height: 80vh;
-    max-height: 1000px;
+    height: auto;
+    max-height: min(80vh, 1200px);
     width: auto;
     max-width: 100%;
     margin: 0 auto;
+}
+
+.html-widget {
+    height: auto !important;
+    max-height: min(80vh, 1200px) !important;
 }
diff --git a/bin/check_samplesheet.R b/bin/check_samplesheet.R
@@ -110,7 +110,8 @@ known_extensions <- c(
 known_read_types <- c(
     'illumina',
     'nanopore',
-    'pacbio'
+    'pacbio',
+    'bgiseq'
 )
 
 # Regular expression for characters that cannot appear in IDs
@@ -140,6 +141,7 @@ args <- commandArgs(trailingOnly = TRUE)
 args <- as.list(args)
 # args <- list('~/Downloads/sample_data_N273_14ncbigenomes.csv', '~/Downloads/ref_data.csv')
 # args <- list('test/data/metadata/chaos_samples.csv')
+# args <- list("~/Downloads/ncbi_and_usda_3516_metadata.csv")
 metadata_original_samp <- read.csv(args[[1]], check.names = FALSE)
 if (length(args) > 1) {
     metadata_original_ref <- read.csv(args[[2]], check.names = FALSE)
@@ -740,8 +742,8 @@ metadata_samp$sequence_type <- unlist(lapply(seq_along(metadata_samp$sequence_ty
     }))
     if (sum(is_seq_type) == 0) {
         stop(call. = FALSE, paste0(
-            'The value in the "sequence_type" column on row ', index, ' does not contain a known sequence type. ',
-            'One of the following words must appear (case insensitive):\n',
+            'The value in the "sequence_type" column on row ', index, ' ("', metadata_samp$sequence_type[index], '") does not contain a supported sequence type. ',
+            'The following sequencing types are supported (case insensitive):\n',
             paste0('"', known_read_types, '"', collapse = ', '), '\n'
         ))
     }

diff --git a/conf/aps.config → conf/aps_workshop.config b/conf/aps.config → conf/aps_workshop.config
@@ -15,7 +15,7 @@ params {
     config_profile_description = 'Test dataset for the 2024 APS workshop'
 
     // Input data
-    sample_data  = 'https://raw.githubusercontent.com/grunwaldlab/pathogensurveillance/master/test/data/metadata/aps_workshop.csv'
+    sample_data  = 'test/data/metadata/aps_workshop.csv'
     out_dir = 'aps_workshop_output'
     download_bakta_db = true
     cache_type = 'lenient'

diff --git a/conf/modules.config b/conf/modules.config
@@ -110,7 +110,7 @@ process {
     }
 
     withName: BBMAP_SENDSKETCH {
-        ext.args = 'printall=t reads=10m samplerate=0.5 minkeycount=2'
+        ext.args = 'tossbrokenreads=t printall=t reads=10m samplerate=0.5 minkeycount=2'
         cpus   = { check_max( 1                  , 'cpus'    ) }
         memory = { check_max( 1.GB * task.attempt, 'memory'  ) }
         time   = { check_max( 1.h  * task.attempt, 'time'    ) }
@@ -380,6 +380,7 @@ process {
             overwrite: true,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
+        ext.args = "--no-cache"
     }
 
     withName: 'GRAPHTYPER_VCFCONCATENATE' {

diff --git a/dockerfiles/main-report-r-packages/Dockerfile b/dockerfiles/main-report-r-packages/Dockerfile
@@ -79,7 +79,9 @@ EOF
 
 RUN <<EOF
 install.r devtools
-Rscript -e 'devtools::install_github("grunwaldlab/psminer@641727c9e46fb08553a383e6cdf3f501144b3dba")'
+Rscript -e 'devtools::install_github("grunwaldlab/psminer@ba7c73f4a2a5a4983659a911d12bf024574c69bc")'
 EOF
 
+USER root
+
 
diff --git a/dockerfiles/pathogensurveillance-workshop/Dockerfile b/dockerfiles/pathogensurveillance-workshop/Dockerfile
@@ -10,10 +10,15 @@ MAINTAINER Zachary S.L. Foster <[email protected]>
 
 ################## BUILD ######################
 
-USER root
+RUN sudo apt-get update && sudo apt-get install ffmpeg libsm6 libxext6 -y
 
-# Install nanoplot, quast, bandage, multiqc
+RUN <<EOF
+conda create -n qc --yes bioconda::nanoplot bioconda::quast bioconda::multiqc bioconda::bandage
+EOF
 
 RUN <<EOF
-    nextflow run https://github.com/grunwaldlab/pathogensurveillance -profile aps,docker -resume -latest
+conda install -n qc --yes nf-core
 EOF
+
+COPY reads /data/reads
+COPY sample_data.csv /data/sample_data.csv
diff --git a/dockerfiles/pathogensurveillance-workshop/sample_data.csv b/dockerfiles/pathogensurveillance-workshop/sample_data.csv
@@ -0,0 +1,8 @@
+sample_id,path,sequence_type,enabled
+22-315,/data/reads/xan_22-315_nanopore.fastq.gz,nanopore,FALSE
+22-321,/data/reads/xan_22-321_nanopore.fastq.gz,nanopore,FALSE
+22-322,/data/reads/xan_22-322_nanopore.fastq.gz,nanopore,FALSE
+22-323,/data/reads/xan_22-323_nanopore.fastq.gz,nanopore,FALSE
+22-324,/data/reads/xan_22-324_nanopore.fastq.gz,nanopore,FALSE
+22-325,/data/reads/xan_22-325_nanopore.fastq.gz,nanopore,FALSE
+22-331,/data/reads/xan_22-331_nanopore.fastq.gz,nanopore,TRUE
diff --git a/modules/local/align_feature_sequences.nf b/modules/local/align_feature_sequences.nf
@@ -2,7 +2,7 @@ process ALIGN_FEATURE_SEQUENCES {
     tag "$ref_meta.id"
     label 'process_low'
 
-    conda "bioconda::mafft=7.520 bioconda::perl-bioperl=1.7.8 conda-forge::parallel=20230522"
+    conda "conda-forge::mafft=7.526 bioconda::perl-bioperl=1.7.8 conda-forge::parallel=20230522"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/mafft:7.508--hec16e2b_0':
         'zachary-foster/mafft-perl' }"

diff --git a/modules/local/main_report.nf b/modules/local/main_report.nf
@@ -2,18 +2,18 @@ process MAIN_REPORT {
     tag "$group_meta.id"
     label 'process_low'
 
-    conda "conda-forge::r-leaflet conda-forge::r-biocmanager conda-forge::quarto conda-forge::r-knitr conda-forge::r-dplyr conda-forge::r-ggplot2 conda-forge::r-readr conda-forge::r-purrr conda-forge::r-yaml conda-forge::r-ape conda-forge::r-magrittr conda-forge::r-pheatmap conda-forge::r-heatmaply conda-forge::r-tidyverse conda-forge::r-palmerpenguins conda-forge::r-ade4 conda-forge::r-adegenet bioconda::bioconductor-ggtree conda-forge::r-igraph conda-forge::r-visnetwork conda-forge::r-phangorn conda-forge::r-ggnewscale conda-forge::r-kableextra conda-forge::r-plotly conda-forge::r-webshot2 conda-forge::r-ggdendro conda-forge::r-rcrossref"
+    conda "conda-forge::r-devtools conda-forge::r-leaflet conda-forge::r-biocmanager conda-forge::quarto conda-forge::r-knitr conda-forge::r-dplyr conda-forge::r-ggplot2 conda-forge::r-readr conda-forge::r-purrr conda-forge::r-yaml conda-forge::r-ape conda-forge::r-magrittr conda-forge::r-pheatmap conda-forge::r-heatmaply conda-forge::r-tidyverse conda-forge::r-palmerpenguins conda-forge::r-ade4 conda-forge::r-adegenet bioconda::bioconductor-ggtree conda-forge::r-igraph conda-forge::r-visnetwork conda-forge::r-phangorn conda-forge::r-ggnewscale conda-forge::r-kableextra conda-forge::r-plotly conda-forge::r-webshot2 conda-forge::r-ggdendro conda-forge::r-rcrossref"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/main-report-r-packages':
-        'docker.io/zacharyfoster/main-report-r-packages:0.10' }"
+        'docker.io/zacharyfoster/main-report-r-packages:0.13' }"
 
     input:
     tuple val(group_meta), file(inputs)
     path template, stageAs: 'main_report_template'
 
     output:
     tuple val(group_meta), path("${prefix}_pathsurveil_report.html"), emit: html
-    tuple val(group_meta), path("${prefix}_pathsurveil_report.pdf") , emit: pdf
+    tuple val(group_meta), path("${prefix}_pathsurveil_report.pdf") , emit: pdf, optional: true
     path "versions.yml"                                             , emit: versions
 
     when:
@@ -28,12 +28,13 @@ process MAIN_REPORT {
 
     # Render the report
     quarto render main_report \\
+        ${args} \\
         --output-dir ${prefix}_report \\
         -P inputs:../${inputs}
 
     # Rename outputs
     mv main_report/${prefix}_report/index.html ${prefix}_pathsurveil_report.html
-    mv main_report/${prefix}_report/index.pdf ${prefix}_pathsurveil_report.pdf
+    #mv main_report/${prefix}_report/index.pdf ${prefix}_pathsurveil_report.pdf
 
     # Save version of quarto used
     cat <<-END_VERSIONS > versions.yml

diff --git a/modules/nf-core/bakta/bakta/environment.yml b/modules/nf-core/bakta/bakta/environment.yml
diff --git a/modules/nf-core/mafft/environment.yml b/modules/nf-core/mafft/environment.yml
diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf
diff --git a/nextflow.config b/nextflow.config
@@ -163,7 +163,7 @@ profiles {
     wagner_2023                 { includeConfig 'conf/wagner_2023.config' }
     wagner_2023_small           { includeConfig 'conf/wagner_2023_small.config' }
     chaos                       { includeConfig 'conf/chaos.config' }
-    aps_workshop                { includeConfig 'conf/aps.config' }
+    aps_workshop                { includeConfig 'conf/aps_workshop.config' }
 }