
Commit

Merge pull request #2 from heylf/dev
Current additions to set up the base pipeline will be merged so that further development can happen on different components, pushing towards a first release. Lots of things are still missing, but many standard modules are already installed in the pipeline.
FloWuenne authored Sep 25, 2023
2 parents c2676de + b4d8bf4 commit 635b190
Showing 29 changed files with 935 additions and 256 deletions.
117 changes: 65 additions & 52 deletions bin/check_samplesheet.py
@@ -24,17 +24,21 @@ class RowChecker:
"""

VALID_FORMATS = (
".fq.gz",
".fastq.gz",
VALID_IMAGE_FORMATS = (
".tiff",
".tif"
)

VALID_MARKER_FORMATS = (
".csv"
)

def __init__(
self,
sample_col="sample",
first_col="fastq_1",
second_col="fastq_2",
single_col="single_end",
first_col="image",
second_col="marker",
#single_col="single_end",
**kwargs,
):
"""
@@ -43,20 +43,20 @@ def __init__(
Args:
sample_col (str): The name of the column that contains the sample name
(default "sample").
first_col (str): The name of the column that contains the first (or only)
FASTQ file path (default "fastq_1").
second_col (str): The name of the column that contains the second (if any)
FASTQ file path (default "fastq_2").
single_col (str): The name of the new column that will be inserted and
records whether the sample contains single- or paired-end sequencing
reads (default "single_end").
            first_col (str): The name of the column that contains the image
                file path (default "image").
            second_col (str): The name of the column that contains the marker
                file path (default "marker").
#single_col (str): The name of the new column that will be inserted and
# records whether the sample contains single- or paired-end sequencing
# reads (default "single_end").
"""
super().__init__(**kwargs)
self._sample_col = sample_col
self._first_col = first_col
self._second_col = second_col
self._single_col = single_col
#self._single_col = single_col
self._seen = set()
self.modified = []

@@ -72,7 +76,7 @@ def validate_and_transform(self, row):
self._validate_sample(row)
self._validate_first(row)
self._validate_second(row)
self._validate_pair(row)
#self._validate_pair(row)
self._seen.add((row[self._sample_col], row[self._first_col]))
self.modified.append(row)

@@ -84,50 +88,59 @@ def _validate_sample(self, row):
row[self._sample_col] = row[self._sample_col].replace(" ", "_")

def _validate_first(self, row):
"""Assert that the first FASTQ entry is non-empty and has the right format."""
"""Assert that the image entry has the right format if it exists."""
if len(row[self._first_col]) <= 0:
raise AssertionError("At least the first FASTQ file is required.")
self._validate_fastq_format(row[self._first_col])
raise AssertionError("Image required.")
self._validate_image_format(row[self._first_col])

def _validate_second(self, row):
"""Assert that the second FASTQ entry has the right format if it exists."""
if len(row[self._second_col]) > 0:
self._validate_fastq_format(row[self._second_col])

def _validate_pair(self, row):
"""Assert that read pairs have the same file extension. Report pair status."""
if row[self._first_col] and row[self._second_col]:
row[self._single_col] = False
first_col_suffix = Path(row[self._first_col]).suffixes[-2:]
second_col_suffix = Path(row[self._second_col]).suffixes[-2:]
if first_col_suffix != second_col_suffix:
raise AssertionError("FASTQ pairs must have the same file extensions.")
else:
row[self._single_col] = True

def _validate_fastq_format(self, filename):
"""Assert that a given filename has one of the expected FASTQ extensions."""
if not any(filename.endswith(extension) for extension in self.VALID_FORMATS):
"""Assert that the image entry has the right format if it exists."""
if len(row[self._second_col]) <= 0:
raise AssertionError("Marker required.")
self._validate_marker_format(row[self._second_col])

# def _validate_pair(self, row):
# """Assert that read pairs have the same file extension. Report pair status."""
# if row[self._first_col] and row[self._second_col]:
# row[self._single_col] = False
# first_col_suffix = Path(row[self._first_col]).suffixes[-2:]
# second_col_suffix = Path(row[self._second_col]).suffixes[-2:]
# if first_col_suffix != second_col_suffix:
# raise AssertionError("FASTQ pairs must have the same file extensions.")
# else:
# row[self._single_col] = True

def _validate_image_format(self, filename):
"""Assert that a given filename has image extension."""
if not any(filename.endswith(extension) for extension in self.VALID_IMAGE_FORMATS):
raise AssertionError(
f"The image file has an unrecognized extension: {filename}\n"
f"It should be one of: {', '.join(self.VALID_IMAGE_FORMATS)}"
)

def _validate_marker_format(self, filename):
"""Assert that a given filename has marker extension."""
if not any(filename.endswith(extension) for extension in self.VALID_MARKER_FORMATS):
raise AssertionError(
f"The FASTQ file has an unrecognized extension: {filename}\n"
f"It should be one of: {', '.join(self.VALID_FORMATS)}"
f"The marker file has an unrecognized extension: {filename}\n"
f"It should be one of: {', '.join(self.VALID_MARKER_FORMATS)}"
)

def validate_unique_samples(self):
"""
Assert that the combination of sample name and FASTQ filename is unique.
Assert that the combination of sample name and image filename is unique.
In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the
number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment.
        number of times the same sample exists, but with different image files, e.g., multiple runs per experiment.
"""
if len(self._seen) != len(self.modified):
raise AssertionError("The pair of sample name and FASTQ must be unique.")
raise AssertionError("The pair of channel and image must be unique.")
seen = Counter()
for row in self.modified:
sample = row[self._sample_col]
seen[sample] += 1
row[self._sample_col] = f"{sample}_T{seen[sample]}"
#row[self._sample_col] = f"{sample}_T{seen[sample]}"


def read_head(handle, num_lines=10):
@@ -166,8 +179,8 @@ def check_samplesheet(file_in, file_out):
"""
Check that the tabular samplesheet has the structure expected by nf-core pipelines.
Validate the general shape of the table, expected columns, and each row. Also add
an additional column which records whether one or two FASTQ reads were found.
Validate the general shape of the table, expected columns, and each row.
# Also add an additional column which records whether one or two FASTQ reads were found.
Args:
file_in (pathlib.Path): The given tabular samplesheet. The format can be either
@@ -179,16 +192,16 @@ def check_samplesheet(file_in, file_out):
This function checks that the samplesheet follows the following structure,
see also the `viral recon samplesheet`_::
sample,fastq_1,fastq_2
SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
        sample,image,marker
        SAMPLE_001,exemplar-001-cycle-08.ome.tiff,markers.csv
        SAMPLE_001,exemplar-001-cycle-07.ome.tiff,markers.csv
        SAMPLE_001,exemplar-001-cycle-06.ome.tiff,markers.csv
.. _viral recon samplesheet:
https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
#.. _viral recon samplesheet:
# https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
"""
required_columns = {"sample", "fastq_1", "fastq_2"}
required_columns = {"sample", "image", "marker"}
# See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
with file_in.open(newline="") as in_handle:
reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
@@ -207,7 +220,7 @@ def check_samplesheet(file_in, file_out):
sys.exit(1)
checker.validate_unique_samples()
header = list(reader.fieldnames)
header.insert(1, "single_end")
#header.insert(1, "single_end")
# See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
with file_out.open(mode="w", newline="") as out_handle:
writer = csv.DictWriter(out_handle, header, delimiter=",")
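For context, a minimal sketch of how the reworked RowChecker could be exercised on an in-memory samplesheet. It assumes bin/ is on PYTHONPATH so that check_samplesheet.py is importable; the sample and file names are illustrative, not taken from the repository's test data.

import csv
import io

from check_samplesheet import RowChecker  # assumes bin/ is on PYTHONPATH

samplesheet = io.StringIO(
    "sample,image,marker\n"
    "SAMPLE_001,exemplar-001-cycle-08.ome.tiff,markers.csv\n"
    "SAMPLE_001,exemplar-001-cycle-07.ome.tiff,markers.csv\n"
)

checker = RowChecker()  # defaults: sample_col="sample", first_col="image", second_col="marker"
for row in csv.DictReader(samplesheet):
    checker.validate_and_transform(row)  # raises AssertionError on empty fields or bad extensions
checker.validate_unique_samples()  # every (sample, image) pair must be unique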
4 changes: 0 additions & 4 deletions conf/modules.config
@@ -26,10 +26,6 @@ process {
]
}

withName: FASTQC {
ext.args = '--quiet'
}

withName: CUSTOM_DUMPSOFTWAREVERSIONS {
publishDir = [
path: { "${params.outdir}/pipeline_info" },
2 changes: 2 additions & 0 deletions lib/WorkflowMain.groovy
@@ -52,6 +52,7 @@ class WorkflowMain {
//
// Get attribute from genome config file e.g. fasta
//
/*
public static Object getGenomeAttribute(params, attribute) {
if (params.genomes && params.genome && params.genomes.containsKey(params.genome)) {
if (params.genomes[ params.genome ].containsKey(attribute)) {
@@ -60,4 +61,5 @@ class WorkflowMain {
}
return null
}
*/
}
5 changes: 4 additions & 1 deletion lib/WorkflowMcmicro.groovy
@@ -10,15 +10,16 @@ class WorkflowMcmicro {
//
// Check and validate parameters
//
/*
public static void initialise(params, log) {
genomeExistsError(params, log)

if (!params.fasta) {
Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file."
}
}
*/

//
// Get workflow summary for MultiQC
@@ -109,6 +110,7 @@ class WorkflowMcmicro {
//
// Exit pipeline if incorrect --genome key provided
//
/*
private static void genomeExistsError(params, log) {
if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) {
def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
@@ -119,4 +121,5 @@ class WorkflowMcmicro {
Nextflow.error(error_string)
}
}
*/
}
2 changes: 1 addition & 1 deletion main.nf
@@ -17,7 +17,7 @@ nextflow.enable.dsl = 2
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta')
//params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta')

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
38 changes: 34 additions & 4 deletions modules.json
@@ -5,19 +5,49 @@
"https://github.com/nf-core/modules.git": {
"modules": {
"nf-core": {
"ashlar": {
"branch": "master",
"git_sha": "97b7dc798a002688b6304a453da932b2144727b1",
"installed_by": ["modules"]
},
"backsub": {
"branch": "master",
"git_sha": "240937a2a9c30298110753292be041188891f2cb",
"installed_by": ["modules"]
},
"basicpy": {
"branch": "master",
"git_sha": "716ef3019b66772a817b417078edce2f7b337858",
"installed_by": ["modules"]
},
"cellpose": {
"branch": "master",
"git_sha": "716ef3019b66772a817b417078edce2f7b337858",
"installed_by": ["modules"]
},
"custom/dumpsoftwareversions": {
"branch": "master",
"git_sha": "76cc4938c1f6ea5c7d83fed1eeffc146787f9543",
"git_sha": "05c280924b6c768d484c7c443dad5e605c4ff4b4",
"installed_by": ["modules"]
},
"fastqc": {
"deepcell/mesmer": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"git_sha": "b9829e1064382745d8dff7f1d74d2138d2864f71",
"installed_by": ["modules"]
},
"mcquant": {
"branch": "master",
"git_sha": "b9829e1064382745d8dff7f1d74d2138d2864f71",
"installed_by": ["modules"]
},
"multiqc": {
"branch": "master",
"git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7",
"git_sha": "a6e11ac655e744f7ebc724be669dd568ffdc0e80",
"installed_by": ["modules"]
},
"scimap/mcmicro": {
"branch": "master",
"git_sha": "ebb27711cd5f4de921244bfa81c676504072d31c",
"installed_by": ["modules"]
}
}
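As a quick sanity check after this merge, the newly recorded module entries can be listed straight from modules.json. A sketch only: it assumes the standard nf-core modules.json layout with a top-level "repos" key and that it is run from the pipeline root.

import json
from pathlib import Path

manifest = json.loads(Path("modules.json").read_text())  # run from the pipeline root
for repo_url, repo in manifest["repos"].items():
    for name, info in repo["modules"]["nf-core"].items():
        # e.g. "ashlar: master @ 97b7dc7"
        print(f"{name}: {info['branch']} @ {info['git_sha'][:7]}")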
52 changes: 52 additions & 0 deletions modules/nf-core/ashlar/main.nf

Some generated files are not rendered by default.