
Commit

Merge pull request #2 from heylf/dev
Current additions to set up the base pipeline will be merged so that further development can happen on different components, pushing towards a first release. Lots of things are still missing, but many standard modules are already installed in the pipeline.
FloWuenne authored Sep 25, 2023
2 parents c2676de + b4d8bf4 commit 635b190
Showing 29 changed files with 935 additions and 256 deletions.
117 changes: 65 additions & 52 deletions bin/check_samplesheet.py
@@ -24,17 +24,21 @@ class RowChecker:
"""

VALID_FORMATS = (
".fq.gz",
".fastq.gz",
VALID_IMAGE_FORMATS = (
".tiff",
".tif"
)

VALID_MARKER_FORMATS = (
".csv"
)

def __init__(
self,
sample_col="sample",
first_col="fastq_1",
second_col="fastq_2",
single_col="single_end",
first_col="image",
second_col="marker",
#single_col="single_end",
**kwargs,
):
"""
@@ -43,20 +43,20 @@ def __init__(
Args:
sample_col (str): The name of the column that contains the sample name
(default "sample").
first_col (str): The name of the column that contains the first (or only)
FASTQ file path (default "fastq_1").
second_col (str): The name of the column that contains the second (if any)
FASTQ file path (default "fastq_2").
single_col (str): The name of the new column that will be inserted and
records whether the sample contains single- or paired-end sequencing
reads (default "single_end").
            first_col (str): The name of the column that contains the image
                file path (default "image").
            second_col (str): The name of the column that contains the marker
                file path (default "marker").
#single_col (str): The name of the new column that will be inserted and
# records whether the sample contains single- or paired-end sequencing
# reads (default "single_end").
"""
super().__init__(**kwargs)
self._sample_col = sample_col
self._first_col = first_col
self._second_col = second_col
self._single_col = single_col
#self._single_col = single_col
self._seen = set()
self.modified = []

@@ -72,7 +76,7 @@ def validate_and_transform(self, row):
self._validate_sample(row)
self._validate_first(row)
self._validate_second(row)
self._validate_pair(row)
#self._validate_pair(row)
self._seen.add((row[self._sample_col], row[self._first_col]))
self.modified.append(row)

@@ -84,50 +88,59 @@ def _validate_sample(self, row):
row[self._sample_col] = row[self._sample_col].replace(" ", "_")

def _validate_first(self, row):
"""Assert that the first FASTQ entry is non-empty and has the right format."""
"""Assert that the image entry has the right format if it exists."""
if len(row[self._first_col]) <= 0:
raise AssertionError("At least the first FASTQ file is required.")
self._validate_fastq_format(row[self._first_col])
raise AssertionError("Image required.")
self._validate_image_format(row[self._first_col])

def _validate_second(self, row):
"""Assert that the second FASTQ entry has the right format if it exists."""
if len(row[self._second_col]) > 0:
self._validate_fastq_format(row[self._second_col])

def _validate_pair(self, row):
"""Assert that read pairs have the same file extension. Report pair status."""
if row[self._first_col] and row[self._second_col]:
row[self._single_col] = False
first_col_suffix = Path(row[self._first_col]).suffixes[-2:]
second_col_suffix = Path(row[self._second_col]).suffixes[-2:]
if first_col_suffix != second_col_suffix:
raise AssertionError("FASTQ pairs must have the same file extensions.")
else:
row[self._single_col] = True

def _validate_fastq_format(self, filename):
"""Assert that a given filename has one of the expected FASTQ extensions."""
if not any(filename.endswith(extension) for extension in self.VALID_FORMATS):
"""Assert that the image entry has the right format if it exists."""
if len(row[self._second_col]) <= 0:
raise AssertionError("Marker required.")
self._validate_marker_format(row[self._second_col])

# def _validate_pair(self, row):
# """Assert that read pairs have the same file extension. Report pair status."""
# if row[self._first_col] and row[self._second_col]:
# row[self._single_col] = False
# first_col_suffix = Path(row[self._first_col]).suffixes[-2:]
# second_col_suffix = Path(row[self._second_col]).suffixes[-2:]
# if first_col_suffix != second_col_suffix:
# raise AssertionError("FASTQ pairs must have the same file extensions.")
# else:
# row[self._single_col] = True

def _validate_image_format(self, filename):
"""Assert that a given filename has image extension."""
if not any(filename.endswith(extension) for extension in self.VALID_IMAGE_FORMATS):
raise AssertionError(
f"The image file has an unrecognized extension: {filename}\n"
f"It should be one of: {', '.join(self.VALID_IMAGE_FORMATS)}"
)

def _validate_marker_format(self, filename):
"""Assert that a given filename has marker extension."""
if not any(filename.endswith(extension) for extension in self.VALID_MARKER_FORMATS):
raise AssertionError(
f"The FASTQ file has an unrecognized extension: {filename}\n"
f"It should be one of: {', '.join(self.VALID_FORMATS)}"
f"The marker file has an unrecognized extension: {filename}\n"
f"It should be one of: {', '.join(self.VALID_MARKER_FORMATS)}"
)

def validate_unique_samples(self):
"""
Assert that the combination of sample name and FASTQ filename is unique.
Assert that the combination of sample name and image filename is unique.
In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the
number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment.
        number of times the same sample exists, but with different image files, e.g., multiple runs per experiment.
"""
if len(self._seen) != len(self.modified):
raise AssertionError("The pair of sample name and FASTQ must be unique.")
raise AssertionError("The pair of channel and image must be unique.")
seen = Counter()
for row in self.modified:
sample = row[self._sample_col]
seen[sample] += 1
row[self._sample_col] = f"{sample}_T{seen[sample]}"
#row[self._sample_col] = f"{sample}_T{seen[sample]}"


def read_head(handle, num_lines=10):
@@ -166,8 +179,8 @@ def check_samplesheet(file_in, file_out):
"""
Check that the tabular samplesheet has the structure expected by nf-core pipelines.
Validate the general shape of the table, expected columns, and each row. Also add
an additional column which records whether one or two FASTQ reads were found.
Validate the general shape of the table, expected columns, and each row.
# Also add an additional column which records whether one or two FASTQ reads were found.
Args:
file_in (pathlib.Path): The given tabular samplesheet. The format can be either
@@ -179,16 +192,16 @@ def check_samplesheet(file_in, file_out):
This function checks that the samplesheet follows the following structure,
see also the `viral recon samplesheet`_::
sample,fastq_1,fastq_2
SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
        sample,image,marker
        SAMPLE_001,exemplar-001-cycle-08.ome.tiff,markers.csv
        SAMPLE_001,exemplar-001-cycle-07.ome.tiff,markers.csv
        SAMPLE_001,exemplar-001-cycle-06.ome.tiff,markers.csv
.. _viral recon samplesheet:
https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
#.. _viral recon samplesheet:
# https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
"""
required_columns = {"sample", "fastq_1", "fastq_2"}
required_columns = {"sample", "image", "marker"}
# See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
with file_in.open(newline="") as in_handle:
reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
@@ -207,7 +220,7 @@ def check_samplesheet(file_in, file_out):
sys.exit(1)
checker.validate_unique_samples()
header = list(reader.fieldnames)
header.insert(1, "single_end")
#header.insert(1, "single_end")
# See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
with file_out.open(mode="w", newline="") as out_handle:
writer = csv.DictWriter(out_handle, header, delimiter=",")
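For context, a minimal sketch of how the reworked RowChecker could be exercised on an in-memory samplesheet. It assumes bin/ is on PYTHONPATH so that check_samplesheet.py is importable; the sample and file names are illustrative, not taken from the repository's test data.

import csv
import io

from check_samplesheet import RowChecker  # assumes bin/ is on PYTHONPATH

samplesheet = io.StringIO(
    "sample,image,marker\n"
    "SAMPLE_001,exemplar-001-cycle-08.ome.tiff,markers.csv\n"
    "SAMPLE_001,exemplar-001-cycle-07.ome.tiff,markers.csv\n"
)

checker = RowChecker()  # defaults: sample_col="sample", first_col="image", second_col="marker"
for row in csv.DictReader(samplesheet):
    checker.validate_and_transform(row)  # raises AssertionError on empty fields or bad extensions
checker.validate_unique_samples()  # every (sample, image) pair must be unique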
4 changes: 0 additions & 4 deletions conf/modules.config
@@ -26,10 +26,6 @@ process {
]
}

withName: FASTQC {
ext.args = '--quiet'
}

withName: CUSTOM_DUMPSOFTWAREVERSIONS {
publishDir = [
path: { "${params.outdir}/pipeline_info" },
2 changes: 2 additions & 0 deletions lib/WorkflowMain.groovy
@@ -52,6 +52,7 @@ class WorkflowMain {
//
// Get attribute from genome config file e.g. fasta
//
/*
public static Object getGenomeAttribute(params, attribute) {
if (params.genomes && params.genome && params.genomes.containsKey(params.genome)) {
if (params.genomes[ params.genome ].containsKey(attribute)) {
@@ -60,4 +61,5 @@ class WorkflowMain {
}
return null
}
*/
}
5 changes: 4 additions & 1 deletion lib/WorkflowMcmicro.groovy
@@ -10,15 +10,16 @@ class WorkflowMcmicro {
//
// Check and validate parameters
//
/*
public static void initialise(params, log) {
genomeExistsError(params, log)

if (!params.fasta) {
Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file."
}
}
*/

//
// Get workflow summary for MultiQC
@@ -109,6 +110,7 @@ class WorkflowMcmicro {
//
// Exit pipeline if incorrect --genome key provided
//
/*
private static void genomeExistsError(params, log) {
if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) {
def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
@@ -119,4 +121,5 @@ class WorkflowMcmicro {
Nextflow.error(error_string)
}
}
*/
}
2 changes: 1 addition & 1 deletion main.nf
@@ -17,7 +17,7 @@ nextflow.enable.dsl = 2
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta')
//params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta')

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
38 changes: 34 additions & 4 deletions modules.json
@@ -5,19 +5,49 @@
"https://github.com/nf-core/modules.git": {
"modules": {
"nf-core": {
"ashlar": {
"branch": "master",
"git_sha": "97b7dc798a002688b6304a453da932b2144727b1",
"installed_by": ["modules"]
},
"backsub": {
"branch": "master",
"git_sha": "240937a2a9c30298110753292be041188891f2cb",
"installed_by": ["modules"]
},
"basicpy": {
"branch": "master",
"git_sha": "716ef3019b66772a817b417078edce2f7b337858",
"installed_by": ["modules"]
},
"cellpose": {
"branch": "master",
"git_sha": "716ef3019b66772a817b417078edce2f7b337858",
"installed_by": ["modules"]
},
"custom/dumpsoftwareversions": {
"branch": "master",
"git_sha": "76cc4938c1f6ea5c7d83fed1eeffc146787f9543",
"git_sha": "05c280924b6c768d484c7c443dad5e605c4ff4b4",
"installed_by": ["modules"]
},
"fastqc": {
"deepcell/mesmer": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
"git_sha": "b9829e1064382745d8dff7f1d74d2138d2864f71",
"installed_by": ["modules"]
},
"mcquant": {
"branch": "master",
"git_sha": "b9829e1064382745d8dff7f1d74d2138d2864f71",
"installed_by": ["modules"]
},
"multiqc": {
"branch": "master",
"git_sha": "f2d63bd5b68925f98f572eed70993d205cc694b7",
"git_sha": "a6e11ac655e744f7ebc724be669dd568ffdc0e80",
"installed_by": ["modules"]
},
"scimap/mcmicro": {
"branch": "master",
"git_sha": "ebb27711cd5f4de921244bfa81c676504072d31c",
"installed_by": ["modules"]
}
}
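As a quick sanity check after this merge, the newly recorded module entries can be listed straight from modules.json. A sketch only: it assumes the standard nf-core modules.json layout with a top-level "repos" key and that it is run from the pipeline root.

import json
from pathlib import Path

manifest = json.loads(Path("modules.json").read_text())  # run from the pipeline root
for repo_url, repo in manifest["repos"].items():
    for name, info in repo["modules"]["nf-core"].items():
        # e.g. "ashlar: master @ 97b7dc7"
        print(f"{name}: {info['branch']} @ {info['git_sha'][:7]}")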
52 changes: 52 additions & 0 deletions modules/nf-core/ashlar/main.nf

Some generated files are not rendered by default.