From 365f47b6ee0354b388326348829b5a4a4b1fa271 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Thu, 24 Oct 2024 14:25:58 -0500 Subject: [PATCH 01/32] added sample irida_next sample field option --- assets/schema_input.json | 9 ++++++++- nextflow.config | 2 +- subworkflows/local/input_check.nf | 12 +++++++++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 1e092ed1..c6defb68 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,10 +7,17 @@ "items": { "type": "object", "properties": { + "sample_name": { + "type": "string", + "pattern": "^\\S+$", + "meta": ["irida_id"], + "unique": true, + "errorMessage": "Sample name must be provided and cannot contain spaces" + }, "sample": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces", + "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next.", "meta": ["id"] }, "fastq_1": { diff --git a/nextflow.config b/nextflow.config index 8a0c5552..c90c43e6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -43,7 +43,7 @@ params { show_hidden_params = false validationS3PathCheck = true validationShowHiddenParams = false - validationSchemaIgnoreParams = 'locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler' + validationSchemaIgnoreParams = 
'rasusa,locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler' validationFailUnrecognisedParams = false // for the qcreport fields // SKIP options diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 2792d88b..1cd2787b 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -9,7 +9,6 @@ workflow INPUT_CHECK { main: - // TODO add in automatic gzipping of all samples in versions = Channel.empty() def sample_sheet = params.input reads_in = Channel.fromSamplesheet( @@ -102,8 +101,15 @@ def check_file_exists(def file_path){ def format_reads(ArrayList sheet_data){ def meta = [:] def error_occured = false - meta.id = sheet_data[0] // id is first value - meta.sample = sheet_data[0] // Sample will be id currently + if(sheet_data[1].irida_id != null){ + meta.irida_id = sheet_data[1].irida_id + meta.id = sheet_data[0] // id is first value + meta.sample = sheet_data[0] // Sample will be id currently + }else{ + meta.id = sheet_data[0] // id is first value + meta.sample = sheet_data[0] // Sample will be id currently + } + meta.hybrid = false meta.assembly = false meta.downsampled = false From 7edf2aa35d75c7000984ec02138c6441bc6e71f2 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Thu, 24 Oct 2024 17:01:39 -0500 Subject: [PATCH 02/32] identified sticking point for sample names not being passed to the iridanext config --- conf/irida_next.config | 2 +- main.nf | 1 + 
modules/local/report.nf | 4 ++ nextflow.config | 3 +- tests/main.nf.test | 126 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 134 insertions(+), 2 deletions(-) diff --git a/conf/irida_next.config b/conf/irida_next.config index 24cc2d07..567e570c 100755 --- a/conf/irida_next.config +++ b/conf/irida_next.config @@ -11,7 +11,7 @@ iridanext { overwrite = true validate = false files { - idkey = "sample" + idkey = "irida_id" global = [ "**/FinalReports/Aggregated/Json/final_report.json", "**/FinalReports/Aggregated/Tables/final_report.tsv" diff --git a/main.nf b/main.nf index 71223572..24336b72 100644 --- a/main.nf +++ b/main.nf @@ -112,6 +112,7 @@ workflow MIKROKONDO { ch_versions = ch_versions.mix(REPORT_AGGREGATE.out.versions) + // TODO need to add logic to merge this channel with a previous one to get its INX id updated_samples = REPORT_AGGREGATE.out.flat_samples.flatten().map{ sample -> def name_trim = sample.getName() diff --git a/modules/local/report.nf b/modules/local/report.nf index 0eccfe08..9b9b015a 100644 --- a/modules/local/report.nf +++ b/modules/local/report.nf @@ -47,6 +47,10 @@ process REPORT{ sample_data[meta_data.sample]["meta"] = [:] } + + // TODO add a condition around this to only be appened if iridanext is enabled + update_map_values(sample_data, meta_data, "irida_id") + update_map_values(sample_data, meta_data, "metagenomic") update_map_values(sample_data, meta_data, "assembly") update_map_values(sample_data, meta_data, "hybrid") diff --git a/nextflow.config b/nextflow.config index c90c43e6..256766f3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -7,6 +7,7 @@ */ + // Global default params, used in configs params { // Input options @@ -43,7 +44,7 @@ params { show_hidden_params = false validationS3PathCheck = true validationShowHiddenParams = false - validationSchemaIgnoreParams = 
'rasusa,locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler' + validationSchemaIgnoreParams = '__in_iridanext,rasusa,locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler' validationFailUnrecognisedParams = false // for the qcreport fields // SKIP options diff --git a/tests/main.nf.test b/tests/main.nf.test index 261ff7cb..5d741ee6 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -100,6 +100,132 @@ nextflow_pipeline { } } + test("Should run without failure unzipped IRIDANext id") { + tag "succeed_assembly_inx_id" + + when { + params { + input = "https://raw.githubusercontent.com/phac-nml/mikrokondo/refs/heads/dev/tests/data/samplesheets/samplesheet-small-assembly-inx.csv" + outdir = "results" + platform = "illumina" + + mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh" + mh_min_kmer = 1 + + dehosting_idx = 
"https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi" + kraken2_db = "$baseDir/tests/data/kraken2/test" + + min_reads = 100 + skip_allele_calling = true + + QCReport { + fallthrough { + search = "No organism specific QC data available." + raw_average_quality = 30 + min_n50 = null + max_n50 = null + min_nr_contigs = null + max_nr_contigs = null + fixed_genome_size = 1000 + min_length = null + max_length = null + max_checkm_contamination = 3.0 + min_average_coverage = 30 + } + } + + + skip_bakta = true + skip_staramr = false + skip_mobrecon = false + skip_checkm = false + skip_raw_read_metrics = false + skip_polishing = false + + max_memory = "2.GB" + max_cpus = 1 + } + } + + then { + + assert workflow.success + assert path("$launchDir/results").exists() + + // parse output json file + def json = path("$launchDir/results/FinalReports/Aggregated/Json/final_report.json").json + + assert json.short.short.RawReadSummary.R1."total_bp".equals(118750) + assert json.short.short.RawReadSummary.R1."total_reads".equals(475) + assert json.short.short.RawReadSummary.R1."read_qual_mean".equals(40.0) + assert json.short.short.RawReadSummary.R1."mean_sequence_length".equals(250.0) + + assert json.short.short.FastP.summary.sequencing.equals("paired end (250 cycles + 250 cycles)") + assert json.short.short.FastP.summary.before_filtering.total_reads.equals(950) + assert json.short.short.FastP.filtering_result.passed_filter_reads.equals(950) + assert json.short.short.FastP.filtering_result.low_quality_reads.equals(0) + assert json.short.short.FastP.insert_size.peak.equals(347) + + //assert json.short.meta.metagenomic.equals(false) // Currently, this is "null". 
+ assert json.short.meta.assembly.equals(false) + assert json.short.meta.hybrid.equals(false) + assert json.short.meta.single_end.equals(false) + assert json.short.meta.merge.equals(false) + assert json.short.meta.downsampled.equals(false) + + assert json.short.short.AssemblyCompleted.equals(true) + assert json.short.short.QUAST."0"."Total length (>= 0 bp)".equals("4949") + assert json.short.short.QUAST."0"."Largest contig".equals("4949") + assert json.short.short.QUAST."0"."GC (%)".equals("52.96") + assert json.short.short.QUAST."0"."Avg. coverage depth".equals("47") + + // Below two values should be empty + assert json.short.short.StarAMR."0"."Genotype".equals("None") + assert json.short.short.StarAMR."0"."Predicted Phenotype".equals("Susceptible") + assert json.short.short.StarAMR."0"."Genome Length".equals("4949") + + assert json.short.short.CheckM."0"."# genomes".equals("5656") + assert json.short.short.CheckM."0"."# markers".equals("56") + assert json.short.short.CheckM."0"."# marker sets".equals("24") + assert json.short.short.CheckM."0".Contamination.equals("0.00") + + assert json.short.short.SevenGeneMLSTReport[0].filename.equals("short.filtered.fasta.gz") + + assert json.short.short.Abricate."0".RESISTANCE.equals("NoData") // All Abricate results for this are "NoData". 
+ + def assembly_path = "$launchDir/results/Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz" + assert path(assembly_path).exists() + + // parse assembly file + def assembly_header = path(assembly_path).linesGzip[0] + assert assembly_header.startsWith(">NODE_1_length_4949_cov_23.917254") // _pilon_pilon_pilon gets appended + + // compare IRIDA Next JSON output + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_global = iridanext_json.files.global + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + + // output files + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "Assembly/Quality/QUAST/short/short.transposed_short.quast.quality.tsv" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "Assembly/Quality/SeqKitStats/short.seqkit.stats.summary.tsv" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/short.mash.screen.taxa.screen.screen" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "FinalReports/FlattenedReports/short.flat_sample.json.gz" }.size() == 1 + + // output metadata + assert iridanext_metadata.INX."QC Status" == 
"PASSED" + + } + + } + + test("Should run without failure.") { tag "succeed_assembly" From 351a8f11aeeccb7eac45e18443b8766d162b4364 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Mon, 28 Oct 2024 14:39:53 -0500 Subject: [PATCH 03/32] updated iridanext external name id --- assets/schema_input.json | 6 +++--- bin/report_summaries.py | 29 ++++++++++++++++++++-------- conf/irida_next.config | 2 +- main.nf | 32 +++++++++++++++++++++++-------- modules/local/report.nf | 6 +----- modules/local/report_aggregate.nf | 2 +- nextflow.config | 6 ++---- subworkflows/local/input_check.nf | 17 ++++++++-------- tests/main.nf.test | 2 +- 9 files changed, 63 insertions(+), 39 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index c6defb68..43321486 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -9,10 +9,10 @@ "properties": { "sample_name": { "type": "string", - "pattern": "^\\S+$", - "meta": ["irida_id"], + "pattern": "^[^\\s\\.]+$", + "meta": ["external_id"], "unique": true, - "errorMessage": "Sample name must be provided and cannot contain spaces" + "errorMessage": "This field cannot contain .iridanext_output." }, "sample": { "type": "string", diff --git a/bin/report_summaries.py b/bin/report_summaries.py index a250f753..db4f08ee 100755 --- a/bin/report_summaries.py +++ b/bin/report_summaries.py @@ -37,9 +37,11 @@ class JsonImport: __keep_keys = frozenset(__key_order.keys()) __delimiter = "\t" __key_delimiter = "." 
+ __inx_irida_key = "meta.external_id" - def __init__(self, report_fp, output_name, sample_suffix): - self.tool_data = None # TODO set this in output of group tool fields + def __init__(self, report_fp, output_name, sample_suffix, inx_id_token): + self.inx_id_token = inx_id_token + self.tool_data = None self.output_name = output_name self.output_transposed = os.path.splitext(os.path.basename(self.output_name))[0] + "_transposed.tsv" self.output_dir = os.path.dirname(self.output_name) @@ -49,7 +51,7 @@ def __init__(self, report_fp, output_name, sample_suffix): self.flat_sample_string = sample_suffix self.data = self.ingest_report(self.report_fp) self.flat_data, self.common_fields, self.tool_fields, self.table = self.flatten_json(self.data) - self.output_indv_json(self.flat_data) + self.flat_data = self.output_indv_json(self.flat_data) self.output_flat_json(self.flat_data) self.write_table(self.table) @@ -233,7 +235,6 @@ def remove_prefix_id_fields(self, flattened_dict): top_level_keys.add(item_key) temp[item_key] = v - #self.tool_data = tool_data return reformatted_data, top_level_keys, tool_keys @@ -262,11 +263,22 @@ def output_indv_json(self, flattened_data): Args: flattened_data (json: Dict[sample_id: Dict[tool_info: value]]): """ + updated_items = dict() for k, v in flattened_data.items(): - with open(os.path.join(self.output_dir, k + self.flat_sample_string), "w") as output: + out_path = os.path.join(self.output_dir, k + self.flat_sample_string) + out_key = k + if inx_id := v.get(self.__inx_irida_key): + #! 
this field affects the identification of the irida next id being passed out of the pipeline + out_path = os.path.join(self.output_dir, k + self.inx_id_token + inx_id + self.flat_sample_string) + out_key = inx_id # this field must be overwritten for iridanext to identify the correct metdata field + + with open(out_path, "w") as output: json_data = json.dumps({k: v}, indent=2) output.write(json_data) + updated_items[out_key] = v + flattened_data = updated_items + return flattened_data def to_file(self): with open(self.output_name, "w") as out_file: @@ -291,15 +303,16 @@ def to_file(self): -def main_(args_in): +def main(args_in): default_samp_suffix = "_flat_sample.json" parser = argparse.ArgumentParser("Table Summary") parser.add_argument("-f", "--file-in", help="Path to the mikrokondo json summary") parser.add_argument("-s", "--sample-tag", help="Optional suffix and extension to name output samples.", default=default_samp_suffix) parser.add_argument("-o", "--out-file", help="output name plus the .tsv extension e.g. 
prefix.tsv") + parser.add_argument("-x", "--inx-id-token", help="A token to insert into the flattened json file names for separation of the irida next sample id.") args = parser.parse_args(args_in) if os.path.isfile(args.file_in): - JsonImport(args.file_in, args.out_file, args.sample_tag) + JsonImport(args.file_in, args.out_file, args.sample_tag, args.inx_id_token) else: sys.stderr.write(f"{args.file_in} does not exist.\n") sys.exit(-1) @@ -307,4 +320,4 @@ def main_(args_in): if __name__ == "__main__": # pass json file to program to parse it - main_(sys.argv[1:]) + main(sys.argv[1:]) diff --git a/conf/irida_next.config b/conf/irida_next.config index 567e570c..c2a03d41 100755 --- a/conf/irida_next.config +++ b/conf/irida_next.config @@ -11,7 +11,7 @@ iridanext { overwrite = true validate = false files { - idkey = "irida_id" + idkey = 'external_id' // Previously sample global = [ "**/FinalReports/Aggregated/Json/final_report.json", "**/FinalReports/Aggregated/Tables/final_report.tsv" diff --git a/main.nf b/main.nf index 24336b72..6c9b3631 100644 --- a/main.nf +++ b/main.nf @@ -42,9 +42,6 @@ if (params.help) { if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' 
} - - - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ NAMED WORKFLOW FOR PIPELINE @@ -111,16 +108,35 @@ workflow MIKROKONDO { REPORT_AGGREGATE(REPORT.out.final_report) ch_versions = ch_versions.mix(REPORT_AGGREGATE.out.versions) - - // TODO need to add logic to merge this channel with a previous one to get its INX id updated_samples = REPORT_AGGREGATE.out.flat_samples.flatten().map{ sample -> + def inx_string_suffix = params.report_aggregate.inx_string_insertion def name_trim = sample.getName() def trimmed_name = name_trim.substring(0, name_trim.length() - params.report_aggregate.sample_flat_suffix.length()) - tuple([ + def output_map = [ "id": trimmed_name, - "sample": trimmed_name], - sample) + "sample": trimmed_name, + "external_id": trimmed_name] + + def inx_sample_p = trimmed_name.indexOf(params.report_aggregate.inx_string_insertion) + if(inx_sample_p){ + if(trimmed_name[0.. versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') diff --git a/nextflow.config b/nextflow.config index 256766f3..c4a13e53 100644 --- a/nextflow.config +++ b/nextflow.config @@ -48,8 +48,6 @@ params { validationFailUnrecognisedParams = false // for the qcreport fields // SKIP options - // TODO need to add constants section - // TODO investigate usage of template scripts to replace mash modules skip_report = false skip_raw_read_metrics = false skip_version_gathering = false @@ -59,7 +57,7 @@ params { skip_checkm = false skip_depth_sampling = false // TODO have it mentioned that this should be turned off for metagenomic runs skip_ont_header_cleaning = true // TODO an awk script can likely replace this and be much faster at what it does... 
- skip_polishing = false // TODO make it clear this does not apply to Hybrid assembly + skip_polishing = false skip_species_classification = false skip_mlst = false skip_mobrecon = false @@ -147,7 +145,6 @@ params { version = false - // If a param in camel case is present nextflow automatically creates a kebab case parameter as well stage_in_mode = 'symlink' @@ -719,6 +716,7 @@ params { report_aggregate { sample_flat_suffix = ".flat_sample.json" + inx_string_insertion = ".iridanext_output." } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 1cd2787b..d95833a7 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -5,6 +5,9 @@ include { COMBINE_DATA } from '../../modules/local/combine_data.nf' include { fromSamplesheet } from 'plugin/nf-validation' + + + workflow INPUT_CHECK { main: @@ -69,7 +72,7 @@ def reset_combined_map(LinkedHashMap meta, sun.nio.fs.UnixPath f_reads, sun.nio. /*Re-format the data to make it similar to make it match the input format again */ - // TODO find a way to make this cleaner + def new_meta = meta new_meta.merge = true @@ -101,13 +104,11 @@ def check_file_exists(def file_path){ def format_reads(ArrayList sheet_data){ def meta = [:] def error_occured = false - if(sheet_data[1].irida_id != null){ - meta.irida_id = sheet_data[1].irida_id - meta.id = sheet_data[0] // id is first value - meta.sample = sheet_data[0] // Sample will be id currently - }else{ - meta.id = sheet_data[0] // id is first value - meta.sample = sheet_data[0] // Sample will be id currently + meta.id = sheet_data[0] // id is first value + meta.sample = sheet_data[0] // Sample will be id currently + meta.external_id = sheet_data[0] // This is duplicated to keep later scripting cleaner + if(sheet_data[1].external_id != null){ + meta.external_id = sheet_data[1].external_id } meta.hybrid = false diff --git a/tests/main.nf.test b/tests/main.nf.test index 5d741ee6..784fa7be 100644 --- a/tests/main.nf.test 
+++ b/tests/main.nf.test @@ -216,7 +216,7 @@ nextflow_pipeline { assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1 assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1 assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1 - assert iridanext_samples.INX.findAll { it.path == "FinalReports/FlattenedReports/short.flat_sample.json.gz" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "FinalReports/FlattenedReports/short.iridanext_output.INX.flat_sample.json.gz" }.size() == 1 // output metadata assert iridanext_metadata.INX."QC Status" == "PASSED" From acdb88421e7840d2fb15be2ff4b1dc5444b6d41d Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Mon, 28 Oct 2024 14:50:39 -0500 Subject: [PATCH 04/32] updated changelog and docs --- CHANGELOG.md | 10 +++++++++- assets/schema_input.json | 6 +++--- docs/usage/usage.md | 12 ++++++------ 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d446f39..96f46bc5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,10 +5,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased -### `Changed` +### `Added` - Added RASUSA for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125) +- Added a new field to the `schema_input.json` file to allow for sample ID's from external systems such as IRIDA Next: [PR 140](https://github.com/phac-nml/mikrokondo/pull/140) + +### `Changed` + +- Added a `sample_name` field, `sample` still exists but is used for different purposes [PR 140](https://github.com/phac-nml/mikrokondo/pull/140) + +- RASUSA now used for down sampling of Nanopore or PacBio data. 
[PR 125](https://github.com/phac-nml/mikrokondo/pull/125) + ### `Updated` - Documentation and workflow diagram has been updated. [PR 123](https://github.com/phac-nml/mikrokondo/pull/123) diff --git a/assets/schema_input.json b/assets/schema_input.json index 43321486..01aeca1d 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,14 +7,14 @@ "items": { "type": "object", "properties": { - "sample_name": { + "sample": { "type": "string", "pattern": "^[^\\s\\.]+$", "meta": ["external_id"], "unique": true, "errorMessage": "This field cannot contain .iridanext_output." }, - "sample": { + "sample_name": { "type": "string", "pattern": "^\\S+$", "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next.", @@ -54,6 +54,6 @@ "unique": true } }, - "required": ["sample"] + "required": ["sample_name"] } } diff --git a/docs/usage/usage.md b/docs/usage/usage.md index b2cf6c6f..09792343 100644 --- a/docs/usage/usage.md +++ b/docs/usage/usage.md @@ -17,7 +17,7 @@ This pipeline requires sample files to be gzipped (symlinks may be problematic). ### Samplesheet (CSV) Mikrokondo requires a sample sheet to be run. This FOFN (file of file names) contains the samples names and allows a user to combine read-sets based on that name if provided. 
The sample-sheet can utilize the following header fields: - sample +- sample_name - fastq_1 - fastq_2 - long_reads @@ -28,31 +28,31 @@ Example layouts for different sample-sheets include: _Illumina paired-end data_ -|sample|fastq_1|fastq_2| +|sample_name|fastq_1|fastq_2| |------|-------|-------| |sample_name|path_to_forward_reads|path_to_reversed_reads| _Nanopore_ -|sample|long_reads| +|sample_name|long_reads| |------|----------| |sample_name|path_to_reads| _Hybrid Assembly_ -|sample|fastq_1|fastq_2|long_reads| +|sample_name|fastq_1|fastq_2|long_reads| |-------|-------|------|----------| |sample_name|path_to_forward_reads|path_to_reversed_reads|path_to_long_reads| _Starting with assembly only_ -|sample|assembly| +|sample_name|assembly| |------|--------| |sample_name|path_to_assembly| _Example merging paired-end data_ -|sample|fastq_1|fastq_2| +|sample_name|fastq_1|fastq_2| |------|-------|-------| |my_sample|path_to_forward_reads_1|path_to_reversed_reads_1| |my_sample|path_to_forward_reads_2|path_to_reversed_reads_2| From dcdce6d77515dda3e947e09b917335f6e91a8bd4 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Mon, 28 Oct 2024 15:47:08 -0500 Subject: [PATCH 05/32] updated samples sheet names --- tests/data/samplesheets/samplesheet-campy-staph.csv | 2 +- tests/data/samplesheets/samplesheet-small-assembly.csv | 2 +- tests/data/samplesheets/samplesheet-small-metagenomic.csv | 2 +- .../samplesheet-test-from-assemblies-listeria.csv | 2 +- .../samplesheet-test-from-assemblies-salmonella.csv | 2 +- .../samplesheets/samplesheet-test-from-assemblies-vibrio.csv | 2 +- tests/data/samplesheets/samplesheet-test-from-assemblies.csv | 4 ++-- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/data/samplesheets/samplesheet-campy-staph.csv b/tests/data/samplesheets/samplesheet-campy-staph.csv index 203b4804..8bb1350f 100644 --- a/tests/data/samplesheets/samplesheet-campy-staph.csv +++ b/tests/data/samplesheets/samplesheet-campy-staph.csv @@ -1,2 +1,2 @@
-sample,fastq_1,fastq_2,long_reads,assembly +sample_name,fastq_1,fastq_2,long_reads,assembly CSE,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,, diff --git a/tests/data/samplesheets/samplesheet-small-assembly.csv b/tests/data/samplesheets/samplesheet-small-assembly.csv index bc658f43..8ea218c7 100644 --- a/tests/data/samplesheets/samplesheet-small-assembly.csv +++ b/tests/data/samplesheets/samplesheet-small-assembly.csv @@ -1,2 +1,2 @@ -sample,fastq_1,fastq_2,long_reads,assembly +sample_name,fastq_1,fastq_2,long_reads,assembly short,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq.gz,, diff --git a/tests/data/samplesheets/samplesheet-small-metagenomic.csv b/tests/data/samplesheets/samplesheet-small-metagenomic.csv index 3e341b9b..d245aff8 100644 --- a/tests/data/samplesheets/samplesheet-small-metagenomic.csv +++ b/tests/data/samplesheets/samplesheet-small-metagenomic.csv @@ -1,2 +1,2 @@ -sample,fastq_1,fastq_2,long_reads,assembly +sample_name,fastq_1,fastq_2,long_reads,assembly meta-small,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads2.fq.gz,, diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv index 69236531..7bc21370 100644 --- a/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv +++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv @@ -1,2 +1,2 @@ -sample,fastq_1,fastq_2,long_reads,assembly +sample_name,fastq_1,fastq_2,long_reads,assembly listeria_GCF_000196035,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/listeria/GCF_000196035.1_ASM19603v1_genomic.fna.gz diff 
--git a/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv index 2526bbe5..d9ca7157 100644 --- a/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv +++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv @@ -1,2 +1,2 @@ -sample,fastq_1,fastq_2,long_reads,assembly +sample_name,fastq_1,fastq_2,long_reads,assembly salmonella_GCA_000008105,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/salmonella/GCA_000008105.1_ASM810v1_genomic.fna.gz diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv index 98a1f026..f475bdc8 100644 --- a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv +++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv @@ -1,2 +1,2 @@ -sample,fastq_1,fastq_2,long_reads,assembly +sample_name,fastq_1,fastq_2,long_reads,assembly st_120,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies.csv index 07a21039..4de3619c 100644 --- a/tests/data/samplesheets/samplesheet-test-from-assemblies.csv +++ b/tests/data/samplesheets/samplesheet-test-from-assemblies.csv @@ -1,2 +1,2 @@ -sample,fastq_1,fastq_2,long_reads,assembly -ecoli_GCA_000947975,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/ecoli/GCA_000947975.1_ASM94797v1_genomic.fna.gz \ No newline at end of file +sample_name,fastq_1,fastq_2,long_reads,assembly +ecoli_GCA_000947975,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/ecoli/GCA_000947975.1_ASM94797v1_genomic.fna.gz From fd4ea245dac56514ef17da7d8a23fe7f00b65a5c Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Mon, 28 Oct 2024 16:56:29 -0500 Subject: 
[PATCH 06/32] updated inx id parsing --- bin/report_summaries.py | 5 +++-- main.nf | 22 +++++++--------------- subworkflows/local/input_check.nf | 1 - 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/bin/report_summaries.py b/bin/report_summaries.py index db4f08ee..8ee4c064 100755 --- a/bin/report_summaries.py +++ b/bin/report_summaries.py @@ -243,7 +243,7 @@ def ingest_report(self, report_fp): report_fp: File path to the json report to be read in """ data = None - with open(report_fp, "r", encoding="utf8") as report: + with open(report_fp, "r") as report: data = json.load(report) return data @@ -267,7 +267,8 @@ def output_indv_json(self, flattened_data): for k, v in flattened_data.items(): out_path = os.path.join(self.output_dir, k + self.flat_sample_string) out_key = k - if inx_id := v.get(self.__inx_irida_key): + if v.get(self.__inx_irida_key) != k: + inx_id = v[self.__inx_irida_key] #! this field affects the identification of the irida next id being passed out of the pipeline out_path = os.path.join(self.output_dir, k + self.inx_id_token + inx_id + self.flat_sample_string) out_key = inx_id # this field must be overwritten for iridanext to identify the correct metdata field diff --git a/main.nf b/main.nf index 6c9b3631..f32e40c1 100644 --- a/main.nf +++ b/main.nf @@ -118,22 +118,14 @@ workflow MIKROKONDO { "sample": trimmed_name, "external_id": trimmed_name] - def inx_sample_p = trimmed_name.indexOf(params.report_aggregate.inx_string_insertion) + def inx_sample_p = trimmed_name.contains(inx_string_suffix) + println "inx_sample_p: ${}" if(inx_sample_p){ - if(trimmed_name[0.. tuple(meta.id[0], meta[0]) } - if(params.opt_platforms.ont == params.platform && params.nanopore_chemistry == null){ exit 1, "ERROR: Nanopore data was selected without a model being specified." 
} From 0d81ebfd8051d8902dbd0900789c1738f323e272 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Mon, 28 Oct 2024 17:02:41 -0500 Subject: [PATCH 07/32] updated sample sheet parsing --- main.nf | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index f32e40c1..b2113ebc 100644 --- a/main.nf +++ b/main.nf @@ -118,9 +118,8 @@ workflow MIKROKONDO { "sample": trimmed_name, "external_id": trimmed_name] - def inx_sample_p = trimmed_name.contains(inx_string_suffix) - println "inx_sample_p: ${}" - if(inx_sample_p){ + def inx_sample_p = trimmed_name.indexOf(inx_string_suffix) + if(inx_sample_p > 0){ // -1 was not being evaluated as true def inx_id = trimmed_name.substring(inx_sample_p + inx_string_suffix.length(), trimmed_name.length()) trimmed_name = trimmed_name.substring(0, inx_sample_p) output_map.id = trimmed_name From c036fb5025a7cfc4ced6b34227dd601cacf57620 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 29 Oct 2024 12:48:57 -0500 Subject: [PATCH 08/32] updated tests --- CHANGELOG.md | 2 + assets/schema_input.json | 4 +- ...st-from-assemblies-vibrio-stupid-names.csv | 2 + tests/pipelines/main.from_assemblies.nf.test | 62 +++++++++++++++++++ 4 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv diff --git a/CHANGELOG.md b/CHANGELOG.md index 96f46bc5..58aa5c80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - RASUSA now used for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125) +- Sample names (`sample_name` field) can no longer begin with a period. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125) + ### `Updated` - Documentation and workflow diagram has been updated. 
[PR 123](https://github.com/phac-nml/mikrokondo/pull/123) diff --git a/assets/schema_input.json b/assets/schema_input.json index 01aeca1d..2ca6a15e 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -16,8 +16,8 @@ }, "sample_name": { "type": "string", - "pattern": "^\\S+$", - "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next.", + "pattern": "^[^\\.]\\S+$", + "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. Input cannot start with period.", "meta": ["id"] }, "fastq_1": { diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv new file mode 100644 index 00000000..c5215eac --- /dev/null +++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv @@ -0,0 +1,2 @@ +sample,sample_name,fastq_1,fastq_2,long_reads,assembly +INX,iridanext_output.,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz diff --git a/tests/pipelines/main.from_assemblies.nf.test b/tests/pipelines/main.from_assemblies.nf.test index 99946300..652f21c2 100644 --- a/tests/pipelines/main.from_assemblies.nf.test +++ b/tests/pipelines/main.from_assemblies.nf.test @@ -796,4 +796,66 @@ nextflow_pipeline { } } + test("Test Stupid Name in Input Sheet") { + tag "from_assemblies_stupidnames" + + when { + params { + // Need to update with 7 gene when complete + input = "$baseDir/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv" + outdir = "results" + + platform = "illumina" + + mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/mash.msh" + dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi" + kraken2_db = "$baseDir/tests/data/kraken2/test" + lx_allele_database = "$baseDir/tests/data/databases/locidex_dbs" + 
qt_min_contig_length = 1 + + skip_bakta = true + skip_mobrecon = true + skip_checkm = true + skip_raw_read_metrics = true + skip_polishing = true + skip_mlst = true + skip_version_gathering = true + skip_staramr = true + skip_length_filtering_contigs = true + + skip_subtyping = false + skip_allele_calling = false + lx_report_max_stop = 100 + max_memory = "2.GB" + max_cpus = 1 + } + } + + then { + assert workflow.success + assert path("$launchDir/results").exists() + + // Compare IRIDANext json + assert path("$launchDir/results/iridanext.output.json").exists() + def iridanext_json = path("$launchDir/results/iridanext.output.json").json + def iridanext_global = iridanext_json.files.global + def iridanext_samples = iridanext_json.files.samples + def iridanext_metadata = iridanext_json.metadata.samples + def vibrio_metadata = iridanext_metadata.INX + + // Output files + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1 + assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1 + + assert vibrio_metadata.locidex_db_name == "Vibrio cholerae" + assert vibrio_metadata.locidex_db_date == "2024-07-30" + assert vibrio_metadata.locidex_db_version == "1.0.0" + assert vibrio_metadata.total_loci == 7 + assert vibrio_metadata.count_loci_found == 6 + assert vibrio_metadata.count_loci_missing == 1 + + } + } + + } From 6cb6d8c9cfc8e3efa5f8b778b62ed7cef39006b5 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Thu, 31 Oct 2024 13:08:49 -0500 Subject: [PATCH 09/32] updating commits for feedback --- assets/schema_input.json | 6 +++--- docs/usage/usage.md | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 2ca6a15e..68802293 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,14 +7,14 @@ "items": { "type": "object", "properties": { - "sample": { + "sample_name": 
{ "type": "string", "pattern": "^[^\\s\\.]+$", "meta": ["external_id"], "unique": true, "errorMessage": "This field cannot contain .iridanext_output." }, - "sample_name": { + "sample": { "type": "string", "pattern": "^[^\\.]\\S+$", "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. Input cannot start with period.", @@ -54,6 +54,6 @@ "unique": true } }, - "required": ["sample_name"] + "required": ["sample"] } } diff --git a/docs/usage/usage.md b/docs/usage/usage.md index 09792343..34aeec1c 100644 --- a/docs/usage/usage.md +++ b/docs/usage/usage.md @@ -17,7 +17,7 @@ This pipeline requires sample files to be gzipped (symlinks may be problematic). ### Samplesheet (CSV) Mikrokondo requires a sample sheet to be run. This FOFN (file of file names) contains the samples names and allows a user to combine read-sets based on that name if provided. The sample-sheet can utilize the following header fields: -- sample_name +- sample - fastq_1 - fastq_2 - long_reads @@ -28,31 +28,31 @@ Example layouts for different sample-sheets include: _Illumina paired-end data_ -|sample_name|fastq_1|fastq_2| +|sample|fastq_1|fastq_2| |------|-------|-------| -|sample_name|path_to_forward_reads|path_to_reversed_reads| +|sample|path_to_forward_reads|path_to_reversed_reads| _Nanopore_ -|sample_name|long_reads| +|sample|long_reads| |------|----------| -|sample_name|path_to_reads| +|sample|path_to_reads| _Hybrid Assembly_ -|sample_name|fastq_1|fastq_2|long_reads| +|sample|fastq_1|fastq_2|long_reads| |-------|-------|------|----------| -|sample_name|path_to_forward_reads|path_to_reversed_reads|path_to_long_reads| +|sample|path_to_forward_reads|path_to_reversed_reads|path_to_long_reads| _Starting with assembly only_ -|sample_name|assembly| +|sample|assembly| |------|--------| -|sample_name|path_to_assembly| +|sample|path_to_assembly| _Example merging paired-end data_ -|sample_name|fastq_1|fastq_2| +|sample|fastq_1|fastq_2| |------|-------|-------| 
|my_sample|path_to_forward_reads_1|path_to_reversed_reads_1| |my_sample|path_to_forward_reads_2|path_to_reversed_reads_2| From db34308797a4043a138ece4a86bc9b0351a94d86 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Thu, 31 Oct 2024 13:20:55 -0500 Subject: [PATCH 10/32] updated samplesheets --- tests/data/samplesheets/samplesheet-campy-staph.csv | 2 +- tests/data/samplesheets/samplesheet-small-assembly.csv | 2 +- tests/data/samplesheets/samplesheet-small-metagenomic.csv | 2 +- .../samplesheets/samplesheet-test-from-assemblies-listeria.csv | 2 +- .../samplesheet-test-from-assemblies-salmonella.csv | 2 +- .../samplesheet-test-from-assemblies-vibrio-stupid-names.csv | 2 +- .../samplesheets/samplesheet-test-from-assemblies-vibrio.csv | 2 +- tests/data/samplesheets/samplesheet-test-from-assemblies.csv | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/data/samplesheets/samplesheet-campy-staph.csv b/tests/data/samplesheets/samplesheet-campy-staph.csv index 8bb1350f..203b4804 100644 --- a/tests/data/samplesheets/samplesheet-campy-staph.csv +++ b/tests/data/samplesheets/samplesheet-campy-staph.csv @@ -1,2 +1,2 @@ -sample_name,fastq_1,fastq_2,long_reads,assembly +sample,fastq_1,fastq_2,long_reads,assembly CSE,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,, diff --git a/tests/data/samplesheets/samplesheet-small-assembly.csv b/tests/data/samplesheets/samplesheet-small-assembly.csv index 8ea218c7..bc658f43 100644 --- a/tests/data/samplesheets/samplesheet-small-assembly.csv +++ b/tests/data/samplesheets/samplesheet-small-assembly.csv @@ -1,2 +1,2 @@ -sample_name,fastq_1,fastq_2,long_reads,assembly +sample,fastq_1,fastq_2,long_reads,assembly short,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq.gz,, diff --git 
a/tests/data/samplesheets/samplesheet-small-metagenomic.csv b/tests/data/samplesheets/samplesheet-small-metagenomic.csv index d245aff8..3e341b9b 100644 --- a/tests/data/samplesheets/samplesheet-small-metagenomic.csv +++ b/tests/data/samplesheets/samplesheet-small-metagenomic.csv @@ -1,2 +1,2 @@ -sample_name,fastq_1,fastq_2,long_reads,assembly +sample,fastq_1,fastq_2,long_reads,assembly meta-small,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads2.fq.gz,, diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv index 7bc21370..69236531 100644 --- a/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv +++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv @@ -1,2 +1,2 @@ -sample_name,fastq_1,fastq_2,long_reads,assembly +sample,fastq_1,fastq_2,long_reads,assembly listeria_GCF_000196035,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/listeria/GCF_000196035.1_ASM19603v1_genomic.fna.gz diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv index d9ca7157..2526bbe5 100644 --- a/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv +++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv @@ -1,2 +1,2 @@ -sample_name,fastq_1,fastq_2,long_reads,assembly +sample,fastq_1,fastq_2,long_reads,assembly salmonella_GCA_000008105,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/salmonella/GCA_000008105.1_ASM810v1_genomic.fna.gz diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv index c5215eac..eb265562 100644 --- 
a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv +++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv @@ -1,2 +1,2 @@ -sample,sample_name,fastq_1,fastq_2,long_reads,assembly +sample,sample,fastq_1,fastq_2,long_reads,assembly INX,iridanext_output.,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv index f475bdc8..98a1f026 100644 --- a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv +++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv @@ -1,2 +1,2 @@ -sample_name,fastq_1,fastq_2,long_reads,assembly +sample,fastq_1,fastq_2,long_reads,assembly st_120,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies.csv index 4de3619c..ba8e235d 100644 --- a/tests/data/samplesheets/samplesheet-test-from-assemblies.csv +++ b/tests/data/samplesheets/samplesheet-test-from-assemblies.csv @@ -1,2 +1,2 @@ -sample_name,fastq_1,fastq_2,long_reads,assembly +sample,fastq_1,fastq_2,long_reads,assembly ecoli_GCA_000947975,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/ecoli/GCA_000947975.1_ASM94797v1_genomic.fna.gz From 0c6e6d1630a725527ab1b773c4bb5f074e9fcd34 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Thu, 31 Oct 2024 13:55:52 -0500 Subject: [PATCH 11/32] updated sample sheet name --- .../samplesheet-test-from-assemblies-vibrio-stupid-names.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv index 
eb265562..0a1d49a8 100644 --- a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv +++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv @@ -1,2 +1,2 @@ -sample,sample,fastq_1,fastq_2,long_reads,assembly +sample_name,sample,fastq_1,fastq_2,long_reads,assembly INX,iridanext_output.,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz From d1e5609f61223017a7d6ce2d14bec15015ba1059 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Thu, 31 Oct 2024 16:25:26 -0500 Subject: [PATCH 12/32] updated external_id parsing, tests will fail as path locations need to be updated --- bin/report_summaries.py | 17 +++++++++-------- main.nf | 13 ++----------- modules/local/report_aggregate.nf | 4 ++-- nextflow.config | 1 - 4 files changed, 13 insertions(+), 22 deletions(-) diff --git a/bin/report_summaries.py b/bin/report_summaries.py index 8ee4c064..7686082c 100755 --- a/bin/report_summaries.py +++ b/bin/report_summaries.py @@ -39,8 +39,7 @@ class JsonImport: __key_delimiter = "." __inx_irida_key = "meta.external_id" - def __init__(self, report_fp, output_name, sample_suffix, inx_id_token): - self.inx_id_token = inx_id_token + def __init__(self, report_fp, output_name, sample_suffix): self.tool_data = None self.output_name = output_name self.output_transposed = os.path.splitext(os.path.basename(self.output_name))[0] + "_transposed.tsv" @@ -265,13 +264,16 @@ def output_indv_json(self, flattened_data): """ updated_items = dict() for k, v in flattened_data.items(): - out_path = os.path.join(self.output_dir, k + self.flat_sample_string) + #out_path = os.path.join(self.output_dir, k + self.flat_sample_string) out_key = k + sample_dir = k if v.get(self.__inx_irida_key) != k: - inx_id = v[self.__inx_irida_key] + sample_dir = v[self.__inx_irida_key] #! 
this field affects the identification of the irida next id being passed out of the pipeline - out_path = os.path.join(self.output_dir, k + self.inx_id_token + inx_id + self.flat_sample_string) - out_key = inx_id # this field must be overwritten for iridanext to identify the correct metdata field + out_key = sample_dir # this field must be overwritten for iridanext to identify the correct metdata field + out_path = os.path.join(self.output_dir, sample_dir, k + self.flat_sample_string) + if not os.is_dir(out_path): # Check for directory existence, as it will still exist on pipeline resumes + os.mkdir(out_path) with open(out_path, "w") as output: json_data = json.dumps({k: v}, indent=2) @@ -310,10 +312,9 @@ def main(args_in): parser.add_argument("-f", "--file-in", help="Path to the mikrokondo json summary") parser.add_argument("-s", "--sample-tag", help="Optional suffix and extension to name output samples.", default=default_samp_suffix) parser.add_argument("-o", "--out-file", help="output name plus the .tsv extension e.g. 
prefix.tsv") - parser.add_argument("-x", "--inx-id-token", help="A token to insert into the flattened json file names for separation of the irida next sample id.") args = parser.parse_args(args_in) if os.path.isfile(args.file_in): - JsonImport(args.file_in, args.out_file, args.sample_tag, args.inx_id_token) + JsonImport(args.file_in, args.out_file, args.sample_tag) else: sys.stderr.write(f"{args.file_in} does not exist.\n") sys.exit(-1) diff --git a/main.nf b/main.nf index b2113ebc..cd761b7e 100644 --- a/main.nf +++ b/main.nf @@ -110,22 +110,13 @@ workflow MIKROKONDO { updated_samples = REPORT_AGGREGATE.out.flat_samples.flatten().map{ sample -> - def inx_string_suffix = params.report_aggregate.inx_string_insertion def name_trim = sample.getName() def trimmed_name = name_trim.substring(0, name_trim.length() - params.report_aggregate.sample_flat_suffix.length()) + def external_id_name = trimmed_name.getParent().getBaseName() def output_map = [ "id": trimmed_name, "sample": trimmed_name, - "external_id": trimmed_name] - - def inx_sample_p = trimmed_name.indexOf(inx_string_suffix) - if(inx_sample_p > 0){ // -1 was not being evaluated as true - def inx_id = trimmed_name.substring(inx_sample_p + inx_string_suffix.length(), trimmed_name.length()) - trimmed_name = trimmed_name.substring(0, inx_sample_p) - output_map.id = trimmed_name - output_map.sample = trimmed_name - output_map.external_id = inx_id - } + "external_id": external_id_name] tuple(output_map, sample) } diff --git a/modules/local/report_aggregate.nf b/modules/local/report_aggregate.nf index 3a2cf787..b4a6e180 100644 --- a/modules/local/report_aggregate.nf +++ b/modules/local/report_aggregate.nf @@ -14,13 +14,13 @@ process REPORT_AGGREGATE{ path("final_report.tsv"), emit: final_report path("final_report_transposed.tsv"), emit: final_report_transposed path("final_report_flattened.json"), emit: flattened_files - path("*${sample_flat_suffix}"), emit: flat_samples + path("*/*${sample_flat_suffix}"), emit: 
flat_samples path "versions.yml", emit: versions script: sample_flat_suffix = params.report_aggregate.sample_flat_suffix """ - report_summaries.py -f ${summary_report} -o final_report.tsv -s ${sample_flat_suffix} -x ${params.report_aggregate.inx_string_insertion} + report_summaries.py -f ${summary_report} -o final_report.tsv -s ${sample_flat_suffix} cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') diff --git a/nextflow.config b/nextflow.config index c4a13e53..026a2444 100644 --- a/nextflow.config +++ b/nextflow.config @@ -716,7 +716,6 @@ params { report_aggregate { sample_flat_suffix = ".flat_sample.json" - inx_string_insertion = ".iridanext_output." } From a48fb95052cad75fa8857db8326688a74c289263 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Fri, 1 Nov 2024 13:26:31 -0500 Subject: [PATCH 13/32] updated output of flattened sample reports --- bin/report_summaries.py | 8 +++++--- main.nf | 2 +- tests/main.nf.test | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/bin/report_summaries.py b/bin/report_summaries.py index 7686082c..73b17617 100755 --- a/bin/report_summaries.py +++ b/bin/report_summaries.py @@ -271,9 +271,11 @@ def output_indv_json(self, flattened_data): sample_dir = v[self.__inx_irida_key] #! 
this field affects the identification of the irida next id being passed out of the pipeline out_key = sample_dir # this field must be overwritten for iridanext to identify the correct metdata field - out_path = os.path.join(self.output_dir, sample_dir, k + self.flat_sample_string) - if not os.is_dir(out_path): # Check for directory existence, as it will still exist on pipeline resumes - os.mkdir(out_path) + + out_dir = os.path.join(self.output_dir, sample_dir) + out_path = os.path.join(out_dir, k + self.flat_sample_string) + if not os.path.isdir(out_dir): # Check for directory existence, as it will still exist on pipeline resumes + os.mkdir(out_dir) with open(out_path, "w") as output: json_data = json.dumps({k: v}, indent=2) diff --git a/main.nf b/main.nf index cd761b7e..ad2e991c 100644 --- a/main.nf +++ b/main.nf @@ -112,7 +112,7 @@ workflow MIKROKONDO { sample -> def name_trim = sample.getName() def trimmed_name = name_trim.substring(0, name_trim.length() - params.report_aggregate.sample_flat_suffix.length()) - def external_id_name = trimmed_name.getParent().getBaseName() + def external_id_name = sample.getParent().getBaseName() //Calling getBaseName after getParent does not work as the output items is a string not Path def output_map = [ "id": trimmed_name, "sample": trimmed_name, diff --git a/tests/main.nf.test b/tests/main.nf.test index 784fa7be..5d741ee6 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -216,7 +216,7 @@ nextflow_pipeline { assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1 assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1 assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1 - assert iridanext_samples.INX.findAll { it.path == "FinalReports/FlattenedReports/short.iridanext_output.INX.flat_sample.json.gz" }.size() 
== 1 + assert iridanext_samples.INX.findAll { it.path == "FinalReports/FlattenedReports/short.flat_sample.json.gz" }.size() == 1 // output metadata assert iridanext_metadata.INX."QC Status" == "PASSED" From 39c8505824de4d5dbd577b7674858dc121103ce3 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Fri, 1 Nov 2024 13:27:41 -0500 Subject: [PATCH 14/32] fixed erroneous comment --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index ad2e991c..17f643c7 100644 --- a/main.nf +++ b/main.nf @@ -112,7 +112,7 @@ workflow MIKROKONDO { sample -> def name_trim = sample.getName() def trimmed_name = name_trim.substring(0, name_trim.length() - params.report_aggregate.sample_flat_suffix.length()) - def external_id_name = sample.getParent().getBaseName() //Calling getBaseName after getParent does not work as the output items is a string not Path + def external_id_name = sample.getParent().getBaseName() def output_map = [ "id": trimmed_name, "sample": trimmed_name, From 14653fbae6871f0cfd0f226da4d9b1feeb24ab84 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 5 Nov 2024 16:10:13 -0600 Subject: [PATCH 15/32] updated sample field orders --- assets/schema_input.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 68802293..068bf113 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,18 +7,18 @@ "items": { "type": "object", "properties": { - "sample_name": { + "sample": { "type": "string", - "pattern": "^[^\\s\\.]+$", - "meta": ["external_id"], + "pattern": "^[^\\.]\\S+$", + "meta": ["id"], "unique": true, - "errorMessage": "This field cannot contain .iridanext_output." + "errorMessage": "Sample name to be used in report generation." }, - "sample": { + "sample_name": { "type": "string", "pattern": "^[^\\.]\\S+$", "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. 
Input cannot start with period.", - "meta": ["id"] + "meta": ["external_id"] }, "fastq_1": { "type": "string", From 9c0bad45a544a8e1f8d84c48418b6d2830b9eacc Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 5 Nov 2024 16:29:12 -0600 Subject: [PATCH 16/32] updated logic for renaming sample id --- subworkflows/local/input_check.nf | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index b5d2ebc3..c8f9a39b 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -19,7 +19,11 @@ workflow INPUT_CHECK { parameters_schema: 'nextflow_schema.json', skip_duplicate_check: true).map { // Create grouping value - meta -> tuple(meta.id[0], meta[0]) + meta -> println meta + if (meta[0].external_id != null) { + meta[0].id = meta[0].external_id + } + tuple(meta[0].id, meta[0]) } if(params.opt_platforms.ont == params.platform && params.nanopore_chemistry == null){ From c8827fe61116be560d84b1c6301bcae11cb5a41c Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Wed, 6 Nov 2024 13:38:03 -0600 Subject: [PATCH 17/32] updated sample parsing --- assets/schema_input.json | 7 +++---- nextflow.config | 3 ++- subworkflows/local/input_check.nf | 29 +++++++++++++++++++++++++++-- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 068bf113..fe23e4c3 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -9,15 +9,14 @@ "properties": { "sample": { "type": "string", - "pattern": "^[^\\.]\\S+$", + "pattern": "^\\S+$", "meta": ["id"], "unique": true, - "errorMessage": "Sample name to be used in report generation." + "errorMessage": "Sample name to be used in report generation. Invalid characters are replaces with underscores." }, "sample_name": { "type": "string", - "pattern": "^[^\\.]\\S+$", - "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. 
Input cannot start with period.", + "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. Invalid characters will be replaced with underscores.", "meta": ["external_id"] }, "fastq_1": { diff --git a/nextflow.config b/nextflow.config index 026a2444..86c15d4a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -44,10 +44,11 @@ params { show_hidden_params = false validationS3PathCheck = true validationShowHiddenParams = false - validationSchemaIgnoreParams = '__in_iridanext,rasusa,locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler' + validationSchemaIgnoreParams = 'rasusa,locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler' validationFailUnrecognisedParams = false // for the qcreport fields // SKIP options + skip_read_merging = true skip_report = false skip_raw_read_metrics = false skip_version_gathering = false diff --git a/subworkflows/local/input_check.nf 
b/subworkflows/local/input_check.nf index c8f9a39b..c9011aaf 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -14,15 +14,40 @@ workflow INPUT_CHECK { versions = Channel.empty() def sample_sheet = params.input + + // Thank you snvphylnfc for the ideas :) + // https://github.com/phac-nml/snvphylnfc/blob/f1e5fae76af276acf0a8c98174978cb21ca5d7e0/workflows/snvphylnfc.nf#L98-L109 + def processedIDs = [] as Set + reads_in = Channel.fromSamplesheet( "input", // apparentely input maps to params.input... parameters_schema: 'nextflow_schema.json', skip_duplicate_check: true).map { // Create grouping value - meta -> println meta + meta -> + // Remove any unallowed charactars in the meta.id field + meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_.\-]/, '_') + if (meta[0].external_id != null) { - meta[0].id = meta[0].external_id + // remove any charactars in the external_id that should not be used + meta[0].id = meta[0].external_id.replaceAll(/[^A-Za-z0-9_.\-]/, '_') } + + if(processedIDs.contains(meta.id) && params.skip_read_merging){ + // If the id is already contained and read merging is not to be + // performed, then we make the id's unique to proceed with processing + // read merging is set to false by default, so that when it is run + // in IRIDANext reads are only merged in irida next + while (processedIDs.contains(meta.id)) { + meta.id = "${meta.id}_${meta.external_id}" + } + }else{ + // Set the external id to the input ID. 
+ meta[0].external_id = meta[0].id + } + + + processedIDs << meta.id tuple(meta[0].id, meta[0]) } From db5f420e82e289cd596817f48772b2e3323438a4 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Wed, 6 Nov 2024 13:56:03 -0600 Subject: [PATCH 18/32] updated docs, changelog and nextflow_schema.json --- CHANGELOG.md | 2 ++ docs/usage/usage.md | 3 +++ nextflow_schema.json | 6 ++++++ 3 files changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 58aa5c80..98c6172b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added a new field to the `schema_input.json` file to allow for sample ID's from external systems such as IRIDA Next: [PR 140](https://github.com/phac-nml/mikrokondo/pull/140) +- Incorporated a `--skip_read_merging` parameter to prevent read merging [PR 140](https://github.com/phac-nml/mikrokondo/pull/140) + ### `Changed` - Added a `sample_name` field, `sample` still exists but is used for different purposes [PR 140](https://github.com/phac-nml/mikrokondo/pull/140) diff --git a/docs/usage/usage.md b/docs/usage/usage.md index 34aeec1c..eff83863 100644 --- a/docs/usage/usage.md +++ b/docs/usage/usage.md @@ -23,6 +23,7 @@ Mikrokondo requires a sample sheet to be run. This FOFN (file of file names) con - long_reads - assembly +> **Note:** Illegal characters (e.g. characters that match the expression [^A-Za-z0-9_.\-] ) in the sample name will be replaced with underscores. Example layouts for different sample-sheets include: @@ -96,6 +97,8 @@ _Example merging paired-end data_ Numerous steps within mikrokondo can be turned off without compromising the stability of the pipeline. This skip options can reduce run-time of the pipeline or allow for completion of the pipeline despite errors. 
** All of the above options can be turned on by entering `--{skip_option} true` in the command line arguments to the pipeline (where optional parameters can be added)** + +- `--skip_read_merging`: Do not merge reads, if duplicate sample names are present the names will be made unique. - `--skip_abricate`: turn off abricate AMR detection - `--skip_bakta`: turn off bakta annotation pipeline (generally a slow step, requiring a database to be specified). - `--skip_checkm`: used as part of the contamination detection within mikrokondo, its run time and resource usage can be quite lengthy. diff --git a/nextflow_schema.json b/nextflow_schema.json index 3de0abcb..70bd6ab5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -380,6 +380,12 @@ "type": "boolean", "description": "Do not enter the subtyping workflow, e.g. ECTyper, SISTR etc will not be ran." }, + "skip_read_merging": { + "type": "boolean", + "default": true, + "description": "Do not merge reads", + "hidden": true + }, "skip_bakta": { "type": "boolean", "default": true, From 52af4a9ed5839a39402993b91577373d2432898c Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Wed, 6 Nov 2024 15:06:39 -0600 Subject: [PATCH 19/32] updated test cases --- tests/main.nf.test | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/main.nf.test b/tests/main.nf.test index 5d741ee6..ef559da2 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -209,17 +209,17 @@ nextflow_pipeline { // output files assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1 assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1 - assert iridanext_samples.INX.findAll { it.path == "Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz" }.size() == 1 - assert iridanext_samples.INX.findAll { it.path == "Assembly/Quality/QUAST/short/short.transposed_short.quast.quality.tsv" }.size() == 1 
- assert iridanext_samples.INX.findAll { it.path == "Assembly/Quality/SeqKitStats/short.seqkit.stats.summary.tsv" }.size() == 1 - assert iridanext_samples.INX.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/short.mash.screen.taxa.screen.screen" }.size() == 1 - assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1 - assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1 - assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1 - assert iridanext_samples.INX.findAll { it.path == "FinalReports/FlattenedReports/short.flat_sample.json.gz" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "Assembly/Quality/QUAST/short/short.transposed_short.quast.quality.tsv" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "Assembly/Quality/SeqKitStats/short.seqkit.stats.summary.tsv" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/short.mash.screen.taxa.screen.screen" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1 + assert iridanext_samples.short.findAll { it.path == "FinalReports/FlattenedReports/short.flat_sample.json.gz" }.size() == 1 // output metadata - assert iridanext_metadata.INX."QC Status" == "PASSED" + assert iridanext_metadata.short."QC Status" == 
"PASSED" } From eb759696e4d2ac85fd317047c18b5500a7f569a4 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Wed, 6 Nov 2024 16:29:03 -0600 Subject: [PATCH 20/32] updating inputcheck tests --- assets/schema_input.json | 1 - modules/local/combine_data.nf | 8 ++++---- subworkflows/local/input_check.nf | 12 +++--------- tests/data/samplesheets/samplesheet-merge-test.csv | 4 ++++ 4 files changed, 11 insertions(+), 14 deletions(-) create mode 100644 tests/data/samplesheets/samplesheet-merge-test.csv diff --git a/assets/schema_input.json b/assets/schema_input.json index fe23e4c3..fd8b8c2a 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -11,7 +11,6 @@ "type": "string", "pattern": "^\\S+$", "meta": ["id"], - "unique": true, "errorMessage": "Sample name to be used in report generation. Invalid characters are replaces with underscores." }, "sample_name": { diff --git a/modules/local/combine_data.nf b/modules/local/combine_data.nf index cf76dded..c03d2083 100644 --- a/modules/local/combine_data.nf +++ b/modules/local/combine_data.nf @@ -20,16 +20,16 @@ process COMBINE_DATA{ def fields_merge = meta.fields_merge if(fastq_1){ - cmd_ << "cat ${meta.fastq_1.join(' ')} > out/${prefix}_R1.merged.fastq.gz;" + cmd_ << "cat ${fastq_1.join(' ')} > out/${prefix}_R1.merged.fastq.gz;" } if(fastq_2){ - cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}_R2.merged.fastq.gz;" + cmd_ << "cat ${fastq_2.join(' ')} > out/${prefix}_R2.merged.fastq.gz;" } if(long_reads){ - cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}.merged.fastq.gz;" + cmd_ << "cat ${long_reads.join(' ')} > out/${prefix}.merged.fastq.gz;" } if(assembly){ - cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}.merged.fastq.gz;" + cmd_ << "cat ${assembly.join(' ')} > out/${prefix}.merged.fastq.gz;" } def cmd = cmd_.join("\n") // creating dummy outputs so that all outputs exist for any scenario diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 
c9011aaf..23668c95 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -6,8 +6,6 @@ include { COMBINE_DATA } from '../../modules/local/combine_data.nf' include { fromSamplesheet } from 'plugin/nf-validation' - - workflow INPUT_CHECK { main: @@ -96,7 +94,7 @@ workflow INPUT_CHECK { versions = versions // channel: [ versions.yml ] } -def reset_combined_map(LinkedHashMap meta, sun.nio.fs.UnixPath f_reads, sun.nio.fs.UnixPath r_reads, sun.nio.fs.UnixPath long_reads, sun.nio.fs.UnixPath assembly){ +def reset_combined_map(LinkedHashMap meta, Path f_reads, Path r_reads, Path long_reads, Path assembly){ /*Re-format the data to make it similar to make it match the input format again */ @@ -124,7 +122,7 @@ def reset_combined_map(LinkedHashMap meta, sun.nio.fs.UnixPath f_reads, sun.nio. def check_file_exists(def file_path){ if(!file(file_path).exists()){ - exit 1, "ERROR: Please check input samplesheet -> $file_path does not exist. If your file in you sample sheet does not exist make sure you do not have spaces in your path name." + exit 1, "ERROR: Please check input samplesheet -> $file_path does not exist. Check that you do not have spaces in your path." 
} return true } @@ -134,10 +132,6 @@ def format_reads(ArrayList sheet_data){ def error_occured = false meta.id = sheet_data[0] // id is first value meta.sample = sheet_data[0] // Sample will be id currently - meta.external_id = sheet_data[0] // This is duplicated to keep later scripting cleaner - if(sheet_data[1].external_id != null){ - meta.external_id = sheet_data[1].external_id - } meta.hybrid = false meta.assembly = false @@ -222,7 +216,7 @@ def group_reads(ArrayList read_data){ reads_combine[item] = [] } if(group[item] && check_file_exists(group[item])){ - reads_combine[item] << group[item] + reads_combine[item] << file(group[item]) } } } diff --git a/tests/data/samplesheets/samplesheet-merge-test.csv b/tests/data/samplesheets/samplesheet-merge-test.csv new file mode 100644 index 00000000..1d275345 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-merge-test.csv @@ -0,0 +1,4 @@ +sample,fastq_1,fastq_2,long_reads,assembly +CSE,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,, +CSE,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq.gz,, +un-merged,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads2.fq.gz,, From 45ce5a2ee59545bf8f15da5c0356679f1b70ae2f Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Wed, 6 Nov 2024 16:29:20 -0600 Subject: [PATCH 21/32] added missing files --- .../local/input_check/input_check.nf.test | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 tests/subworkflows/local/input_check/input_check.nf.test diff --git a/tests/subworkflows/local/input_check/input_check.nf.test b/tests/subworkflows/local/input_check/input_check.nf.test new file mode 100644 index 00000000..beccd3f3 --- 
/dev/null +++ b/tests/subworkflows/local/input_check/input_check.nf.test @@ -0,0 +1,76 @@ +nextflow_workflow { + name "Test workflow INPUT_CHECK" + script "subworkflows/local/input_check.nf" + workflow "INPUT_CHECK" + tag "subworkflow" + tag "input_check" + + test("Test input check") { + tag "pass_input_screen" + + when { + + params { + input = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/samplesheets/samplesheet-campy-staph.csv" + outdir = "results" + min_reads = 1 + mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh" + mh_min_kmer = 1 + + dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi" + + kraken2_db = "$baseDir/tests/data/kraken2/test" + + + max_memory = "2.GB" + max_cpus = 1 + } + + } + + then { + assert workflow.success + assert workflow.out.reads == [[['id':'CSE', 'sample':'CSE', 'hybrid':false, 'assembly':false, 'downsampled':false, 'single_end':false, 'merge':false], ['/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz', '/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz']]] + } + } + + test("Test reads are merged") { + tag "pass_merge_reads" + + when { + + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-merge-test.csv" + outdir = "results" + min_reads = 1 + mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh" + mh_min_kmer = 1 + skip_read_merging = false + + dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi" + + kraken2_db = "$baseDir/tests/data/kraken2/test" + + + max_memory = "2.GB" + max_cpus = 1 + } + + } + + then { + assert workflow.success + assert workflow.out.reads[0][0].id == 'CSE' + assert workflow.out.reads[0][0].merge == true + assert workflow.out.reads[0][1][0].endsWith("CSE_R1.merged.fastq.gz") + assert workflow.out.reads[0][1][1].endsWith("CSE_R2.merged.fastq.gz") + + assert 
workflow.out.reads[1][0].id == 'un-merged' + assert workflow.out.reads[1][0].merge == false + assert workflow.out.reads[1][1][0].endsWith("metagenomic_reads1.fq.gz") + assert workflow.out.reads[1][1][1].endsWith("metagenomic_reads2.fq.gz") + } + + } + +} From eec62b329391ca5543b9a67e820c9a4afc806635 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Wed, 6 Nov 2024 16:46:59 -0600 Subject: [PATCH 22/32] updated tests --- .../samplesheets/samplesheet-set-ext-id.csv | 5 +++ .../local/input_check/input_check.nf.test | 32 +++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 tests/data/samplesheets/samplesheet-set-ext-id.csv diff --git a/tests/data/samplesheets/samplesheet-set-ext-id.csv b/tests/data/samplesheets/samplesheet-set-ext-id.csv new file mode 100644 index 00000000..e0d02480 --- /dev/null +++ b/tests/data/samplesheets/samplesheet-set-ext-id.csv @@ -0,0 +1,5 @@ +sample,sample_name,fastq_1,fastq_2,long_reads,assembly +CSE,better.faster.stronger.name,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,, +CSE2,an even stronger name!,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq.gz,, +unique2,this is getting ridiculous,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads2.fq.gz,, +unique3,this is getting ridiculous,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads.fastq,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq,, diff --git a/tests/subworkflows/local/input_check/input_check.nf.test b/tests/subworkflows/local/input_check/input_check.nf.test index beccd3f3..6982dff5 100644 --- a/tests/subworkflows/local/input_check/input_check.nf.test +++ 
b/tests/subworkflows/local/input_check/input_check.nf.test @@ -73,4 +73,36 @@ nextflow_workflow { } + test("Test external id is set") { + tag "pass_set_external" + + when { + + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-set-ext-id.csv" + outdir = "results" + min_reads = 1 + mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh" + mh_min_kmer = 1 + skip_read_merging = false + + dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi" + + kraken2_db = "$baseDir/tests/data/kraken2/test" + + + max_memory = "2.GB" + max_cpus = 1 + } + + } + + then { + assert workflow.success + println workflow.out.reads + //assert workflow.out.reads[0][0].id == "better_faster_stronger_name" + } + + } + } From 733db44548d87d15a411c21d4c8b922031dfe047 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Thu, 7 Nov 2024 12:47:50 -0600 Subject: [PATCH 23/32] fixed failing tests --- subworkflows/local/input_check.nf | 4 ++++ tests/subworkflows/local/input_check/input_check.nf.test | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 23668c95..462f7c2d 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -29,8 +29,11 @@ workflow INPUT_CHECK { if (meta[0].external_id != null) { // remove any charactars in the external_id that should not be used meta[0].id = meta[0].external_id.replaceAll(/[^A-Za-z0-9_.\-]/, '_') + }else{ + meta[0].external_id = meta[0].id } + if(processedIDs.contains(meta.id) && params.skip_read_merging){ // If the id is already contained and read merging is not to be // performed, then we make the id's unique to proceed with processing @@ -132,6 +135,7 @@ def format_reads(ArrayList sheet_data){ def error_occured = false meta.id = sheet_data[0] // id is first value meta.sample = sheet_data[0] // Sample will be id currently + meta.external_id = 
sheet_data[1].external_id meta.hybrid = false meta.assembly = false diff --git a/tests/subworkflows/local/input_check/input_check.nf.test b/tests/subworkflows/local/input_check/input_check.nf.test index 6982dff5..8fff14fa 100644 --- a/tests/subworkflows/local/input_check/input_check.nf.test +++ b/tests/subworkflows/local/input_check/input_check.nf.test @@ -98,6 +98,8 @@ nextflow_workflow { } then { + + // TODO beef up assertions assert workflow.success println workflow.out.reads //assert workflow.out.reads[0][0].id == "better_faster_stronger_name" @@ -105,4 +107,6 @@ nextflow_workflow { } + // TODO add test case for making read ids unique + } From 71260a99ab1334d0ffcafb008e673dce804c973a Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Thu, 7 Nov 2024 14:19:00 -0600 Subject: [PATCH 24/32] updated tests --- subworkflows/local/input_check.nf | 13 +++--- .../samplesheet-make-names-unique.csv | 5 ++ .../local/input_check/input_check.nf.test | 46 +++++++++++++++++-- 3 files changed, 54 insertions(+), 10 deletions(-) create mode 100644 tests/data/samplesheets/samplesheet-make-names-unique.csv diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 462f7c2d..b8c327d3 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -23,24 +23,25 @@ workflow INPUT_CHECK { skip_duplicate_check: true).map { // Create grouping value meta -> + // Remove any unallowed charactars in the meta.id field - meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_.\-]/, '_') + meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_\-]/, '_') if (meta[0].external_id != null) { // remove any charactars in the external_id that should not be used - meta[0].id = meta[0].external_id.replaceAll(/[^A-Za-z0-9_.\-]/, '_') + meta[0].id = meta[0].external_id.replaceAll(/[^A-Za-z0-9_\-]/, '_') }else{ meta[0].external_id = meta[0].id } - if(processedIDs.contains(meta.id) && params.skip_read_merging){ + if(processedIDs.contains(meta[0].id) && 
params.skip_read_merging){ // If the id is already contained and read merging is not to be // performed, then we make the id's unique to proceed with processing // read merging is set to false by default, so that when it is run // in IRIDANext reads are only merged in irida next - while (processedIDs.contains(meta.id)) { - meta.id = "${meta.id}_${meta.external_id}" + while (processedIDs.contains(meta[0].id)) { + meta[0].id = "${meta[0].id}_${meta[0].external_id}" } }else{ // Set the external id to the input ID. @@ -48,7 +49,7 @@ workflow INPUT_CHECK { } - processedIDs << meta.id + processedIDs << meta[0].id tuple(meta[0].id, meta[0]) } diff --git a/tests/data/samplesheets/samplesheet-make-names-unique.csv b/tests/data/samplesheets/samplesheet-make-names-unique.csv new file mode 100644 index 00000000..09d8672c --- /dev/null +++ b/tests/data/samplesheets/samplesheet-make-names-unique.csv @@ -0,0 +1,5 @@ +sample,fastq_1,fastq_2,long_reads,assembly +ha,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,, +ha,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq.gz,, +ha,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads2.fq.gz,, +ha,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads.fastq,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq,, diff --git a/tests/subworkflows/local/input_check/input_check.nf.test b/tests/subworkflows/local/input_check/input_check.nf.test index 8fff14fa..dcf977dd 100644 --- a/tests/subworkflows/local/input_check/input_check.nf.test +++ b/tests/subworkflows/local/input_check/input_check.nf.test @@ -99,14 +99,52 @@ nextflow_workflow { then { - // TODO beef up assertions assert 
workflow.success - println workflow.out.reads - //assert workflow.out.reads[0][0].id == "better_faster_stronger_name" + assert workflow.out.reads[0][0].id == 'an_even_stronger_name_' + assert workflow.out.reads[0][0].merge == false + assert workflow.out.reads[1][0].id == 'better_faster_stronger_name' + assert workflow.out.reads[1][0].merge == false + assert workflow.out.reads[2][0].id == 'this_is_getting_ridiculous' + assert workflow.out.reads[2][0].merge == true + assert workflow.out.reads[2][1][0].endsWith("this_is_getting_ridiculous_R1.merged.fastq.gz") + assert workflow.out.reads[2][1][1].endsWith("this_is_getting_ridiculous_R2.merged.fastq.gz") + } + + } + + test("Test make ids unique") { + tag "pass_make_ids_unique" + + when { + + params { + input = "$baseDir/tests/data/samplesheets/samplesheet-make-names-unique.csv" + outdir = "results" + min_reads = 1 + mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh" + mh_min_kmer = 1 + skip_read_merging = true + dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi" + + kraken2_db = "$baseDir/tests/data/kraken2/test" + + + max_memory = "2.GB" + max_cpus = 1 + } + + } + + then { + + assert workflow.success + assert workflow.out.reads[0][0].id = ha + assert workflow.out.reads[1][0].id = ha_ha + assert workflow.out.reads[2][0].id = ha_ha_ha + assert workflow.out.reads[3][0].id = ha_ha_ha_ha } } - // TODO add test case for making read ids unique } From 3c4e1c4a61248797fd7e71c3f206bb0e279e546d Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Thu, 7 Nov 2024 14:26:07 -0600 Subject: [PATCH 25/32] fixed my own mistakes --- tests/subworkflows/local/input_check/input_check.nf.test | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/subworkflows/local/input_check/input_check.nf.test b/tests/subworkflows/local/input_check/input_check.nf.test index dcf977dd..ec28cc7e 100644 --- 
a/tests/subworkflows/local/input_check/input_check.nf.test +++ b/tests/subworkflows/local/input_check/input_check.nf.test @@ -138,10 +138,10 @@ nextflow_workflow { then { assert workflow.success - assert workflow.out.reads[0][0].id = ha - assert workflow.out.reads[1][0].id = ha_ha - assert workflow.out.reads[2][0].id = ha_ha_ha - assert workflow.out.reads[3][0].id = ha_ha_ha_ha + assert workflow.out.reads[0][0].id == 'ha' + assert workflow.out.reads[1][0].id == 'ha_ha' + assert workflow.out.reads[2][0].id == 'ha_ha_ha' + assert workflow.out.reads[3][0].id == 'ha_ha_ha_ha' } } From 738943b31aa5fa286dcff056cb2937d252b684cc Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Thu, 7 Nov 2024 15:05:01 -0600 Subject: [PATCH 26/32] fixed failing test --- .../local/input_check/input_check.nf.test | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/subworkflows/local/input_check/input_check.nf.test b/tests/subworkflows/local/input_check/input_check.nf.test index ec28cc7e..6eb54be9 100644 --- a/tests/subworkflows/local/input_check/input_check.nf.test +++ b/tests/subworkflows/local/input_check/input_check.nf.test @@ -30,7 +30,17 @@ nextflow_workflow { then { assert workflow.success - assert workflow.out.reads == [[['id':'CSE', 'sample':'CSE', 'hybrid':false, 'assembly':false, 'downsampled':false, 'single_end':false, 'merge':false], ['/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz', '/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz']]] + assert workflow.out.reads[0][0].id == 'CSE' + assert workflow.out.reads[0][0].external_id == 'CSE' + assert workflow.out.reads[0][0].sample == 'CSE' + assert workflow.out.reads[0][0].hybrid == false + assert workflow.out.reads[0][0].assembly == false + assert workflow.out.reads[0][0].downsampled == false + assert workflow.out.reads[0][0].single_end == false + assert workflow.out.reads[0][0].merge == false + assert workflow.out.reads[0][1][0].endsWith('campy-staph1.fq.gz') + 
assert workflow.out.reads[0][1][1].endsWith('campy-staph2.fq.gz') + } } From b1e60dd7e05670bcd6f542fd55c2fe2fe4220b59 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Fri, 8 Nov 2024 12:05:26 -0600 Subject: [PATCH 27/32] swapped external_id and id --- assets/schema_input.json | 4 ++-- subworkflows/local/input_check.nf | 24 +++++++++++------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index fd8b8c2a..39a3c830 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -10,13 +10,13 @@ "sample": { "type": "string", "pattern": "^\\S+$", - "meta": ["id"], + "meta": ["external_id"], "errorMessage": "Sample name to be used in report generation. Invalid characters are replaces with underscores." }, "sample_name": { "type": "string", "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. Invalid characters will be replaced with underscores.", - "meta": ["external_id"] + "meta": ["id"] }, "fastq_1": { "type": "string", diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index b8c327d3..4385fb05 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -24,14 +24,16 @@ workflow INPUT_CHECK { // Create grouping value meta -> - // Remove any unallowed charactars in the meta.id field - meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_\-]/, '_') - - if (meta[0].external_id != null) { - // remove any charactars in the external_id that should not be used - meta[0].id = meta[0].external_id.replaceAll(/[^A-Za-z0-9_\-]/, '_') - }else{ - meta[0].external_id = meta[0].id + // Verify file names do not start with periods as the files can end up being treated as + // hidden files causing odd issues later on in the pipeline + + if(meta[0].id == null){ + // Remove any unallowed charactars in the meta.id field + meta[0].id = meta[0].external_id.replaceAll(/^\./, '_') + meta[0].id = 
meta[0].id.replaceAll(/[^A-Za-z0-9_.\-]/, '_') + }else { + meta[0].id = meta[0].id.replaceAll(/^\./, '_') + meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_.\-]/, '_') } @@ -43,12 +45,8 @@ workflow INPUT_CHECK { while (processedIDs.contains(meta[0].id)) { meta[0].id = "${meta[0].id}_${meta[0].external_id}" } - }else{ - // Set the external id to the input ID. - meta[0].external_id = meta[0].id } - processedIDs << meta[0].id tuple(meta[0].id, meta[0]) } @@ -135,7 +133,7 @@ def format_reads(ArrayList sheet_data){ def meta = [:] def error_occured = false meta.id = sheet_data[0] // id is first value - meta.sample = sheet_data[0] // Sample will be id currently + meta.sample = sheet_data[1].external_id meta.external_id = sheet_data[1].external_id meta.hybrid = false From 70d02912b9a958334dc4f39105cd22a3b5c52637 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Fri, 8 Nov 2024 16:45:31 -0600 Subject: [PATCH 28/32] updating information before the weekend --- CHANGELOG.md | 2 +- modules/local/report.nf | 8 +++++++- subworkflows/local/input_check.nf | 9 +++++---- ...plesheet-test-from-assemblies-vibrio-stupid-names.csv | 4 ++-- tests/subworkflows/local/input_check/input_check.nf.test | 2 +- 5 files changed, 16 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 98c6172b..62f2a563 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added RASUSA for down sampling of Nanopore or PacBio data. 
[PR 125](https://github.com/phac-nml/mikrokondo/pull/125) -- Added a new field to the `schema_input.json` file to allow for sample ID's from external systems such as IRIDA Next: [PR 140](https://github.com/phac-nml/mikrokondo/pull/140) +- Added a new `sample_name` field to the `schema_input.json` file: [PR 140](https://github.com/phac-nml/mikrokondo/pull/140) - Incorporated a `--skip_read_merging` parameter to prevent read merging [PR 140](https://github.com/phac-nml/mikrokondo/pull/140) diff --git a/modules/local/report.nf b/modules/local/report.nf index 8a469955..f361a0f2 100644 --- a/modules/local/report.nf +++ b/modules/local/report.nf @@ -42,18 +42,22 @@ process REPORT{ def report_value = test_in[i+2] if(!sample_data.containsKey(meta_data.sample)){ + // Todo issue grabbing correct tag is here sample_data[meta_data.sample] = [:] sample_data[meta_data.sample]["meta"] = [:] } update_map_values(sample_data, meta_data, "metagenomic") - update_map_values(sample_data, meta_data, "external_id") + //update_map_values(sample_data, meta_data, "sample") + //update_map_values(sample_data, meta_data, "external_id") + //update_map_values(sample_data, meta_data, "id") update_map_values(sample_data, meta_data, "assembly") update_map_values(sample_data, meta_data, "hybrid") update_map_values(sample_data, meta_data, "single_end") update_map_values(sample_data, meta_data, "merge") update_map_values(sample_data, meta_data, "downsampled") + if(!sample_data[meta_data.sample].containsKey(meta_data.id)){ sample_data[meta_data.sample][meta_data.id] = [:] } @@ -678,6 +682,8 @@ def generate_qc_data(data, search_phrases, qc_species_tag){ def species_tag_location = 0 for(k in data){ if(!k.value.meta.metagenomic){ + println k.value + println k.key def species = get_species(k.value[k.key][top_hit_tag], search_phrases, shortest_token) // update coverage first so its values can be used in generating qc messages generate_coverage_data(data[k.key], params.coverage_calc_fields.bp_field, species) 
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 4385fb05..edbcb5f0 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -24,16 +24,17 @@ workflow INPUT_CHECK { // Create grouping value meta -> + // Verify file names do not start with periods as the files can end up being treated as // hidden files causing odd issues later on in the pipeline if(meta[0].id == null){ // Remove any unallowed charactars in the meta.id field meta[0].id = meta[0].external_id.replaceAll(/^\./, '_') - meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_.\-]/, '_') + meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_\.\-]/, '_') }else { meta[0].id = meta[0].id.replaceAll(/^\./, '_') - meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_.\-]/, '_') + meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_\.\-]/, '_') } @@ -46,7 +47,7 @@ workflow INPUT_CHECK { meta[0].id = "${meta[0].id}_${meta[0].external_id}" } } - + println "${meta[0].id} ${meta[0]}" processedIDs << meta[0].id tuple(meta[0].id, meta[0]) } @@ -134,7 +135,7 @@ def format_reads(ArrayList sheet_data){ def error_occured = false meta.id = sheet_data[0] // id is first value meta.sample = sheet_data[1].external_id - meta.external_id = sheet_data[1].external_id + meta.external_id = sheet_data[0] meta.hybrid = false meta.assembly = false diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv index 0a1d49a8..b3227593 100644 --- a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv +++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv @@ -1,2 +1,2 @@ -sample_name,sample,fastq_1,fastq_2,long_reads,assembly -INX,iridanext_output.,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz +sample,sample_name,fastq_1,fastq_2,long_reads,assembly 
+INX,.iridanext_output.,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz diff --git a/tests/subworkflows/local/input_check/input_check.nf.test b/tests/subworkflows/local/input_check/input_check.nf.test index 6eb54be9..df362557 100644 --- a/tests/subworkflows/local/input_check/input_check.nf.test +++ b/tests/subworkflows/local/input_check/input_check.nf.test @@ -112,7 +112,7 @@ nextflow_workflow { assert workflow.success assert workflow.out.reads[0][0].id == 'an_even_stronger_name_' assert workflow.out.reads[0][0].merge == false - assert workflow.out.reads[1][0].id == 'better_faster_stronger_name' + assert workflow.out.reads[1][0].id == 'better.faster.stronger.name' assert workflow.out.reads[1][0].merge == false assert workflow.out.reads[2][0].id == 'this_is_getting_ridiculous' assert workflow.out.reads[2][0].merge == true From 6ba57b0d613792c00db142e71d996de82de7e611 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 12 Nov 2024 10:13:12 -0600 Subject: [PATCH 29/32] fixed stupid name issue report keys not found --- bin/report_summaries.py | 5 ++--- modules/local/report.nf | 14 ++++++-------- subworkflows/local/input_check.nf | 3 +-- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/bin/report_summaries.py b/bin/report_summaries.py index 73b17617..17ca74e3 100755 --- a/bin/report_summaries.py +++ b/bin/report_summaries.py @@ -264,11 +264,10 @@ def output_indv_json(self, flattened_data): """ updated_items = dict() for k, v in flattened_data.items(): - #out_path = os.path.join(self.output_dir, k + self.flat_sample_string) out_key = k sample_dir = k - if v.get(self.__inx_irida_key) != k: - sample_dir = v[self.__inx_irida_key] + if (dir_name := v.get(self.__inx_irida_key)) != k: + sample_dir = dir_name #!
this field affects the identification of the irida next id being passed out of the pipeline out_key = sample_dir # this field must be overwritten for iridanext to identify the correct metdata field diff --git a/modules/local/report.nf b/modules/local/report.nf index f361a0f2..bd14fce7 100644 --- a/modules/local/report.nf +++ b/modules/local/report.nf @@ -41,6 +41,7 @@ process REPORT{ def report_tag = test_in[i+1] def report_value = test_in[i+2] + println meta_data if(!sample_data.containsKey(meta_data.sample)){ // Todo issue grabbing correct tag is here sample_data[meta_data.sample] = [:] @@ -48,18 +49,16 @@ process REPORT{ } update_map_values(sample_data, meta_data, "metagenomic") - //update_map_values(sample_data, meta_data, "sample") - //update_map_values(sample_data, meta_data, "external_id") - //update_map_values(sample_data, meta_data, "id") + update_map_values(sample_data, meta_data, "sample") + update_map_values(sample_data, meta_data, "external_id") update_map_values(sample_data, meta_data, "assembly") update_map_values(sample_data, meta_data, "hybrid") update_map_values(sample_data, meta_data, "single_end") update_map_values(sample_data, meta_data, "merge") update_map_values(sample_data, meta_data, "downsampled") - - if(!sample_data[meta_data.sample].containsKey(meta_data.id)){ - sample_data[meta_data.sample][meta_data.id] = [:] + if(!sample_data[meta_data.sample].containsKey(meta_data.external_id)){ + sample_data[meta_data.sample][meta_data.external_id] = [:] } if(report_value instanceof Path){ @@ -67,14 +66,13 @@ process REPORT{ if(!check_file_params(report_tag, extension)){ continue } - // TODO pass in report metadata def output_data = parse_data(report_value, extension, report_tag, headers_list) if(output_data){ report_value = output_data } } - sample_data[meta_data.sample][meta_data.id][report_tag.report_tag] = report_value + sample_data[meta_data.sample][meta_data.external_id][report_tag.report_tag] = report_value } diff --git 
a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index edbcb5f0..8fd70a64 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -47,7 +47,6 @@ workflow INPUT_CHECK { meta[0].id = "${meta[0].id}_${meta[0].external_id}" } } - println "${meta[0].id} ${meta[0]}" processedIDs << meta[0].id tuple(meta[0].id, meta[0]) } @@ -135,7 +134,7 @@ def format_reads(ArrayList sheet_data){ def error_occured = false meta.id = sheet_data[0] // id is first value meta.sample = sheet_data[1].external_id - meta.external_id = sheet_data[0] + meta.external_id = sheet_data[1].external_id meta.hybrid = false meta.assembly = false From a2c56a8b1f67cd98c0d3dc8602f4da5daa38652d Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 12 Nov 2024 11:59:35 -0600 Subject: [PATCH 30/32] fixed failig test case --- modules/local/report.nf | 4 -- tests/main.nf.test | 84 ++++++++++++++++++++--------------------- 2 files changed, 42 insertions(+), 46 deletions(-) diff --git a/modules/local/report.nf b/modules/local/report.nf index bd14fce7..5b131278 100644 --- a/modules/local/report.nf +++ b/modules/local/report.nf @@ -41,9 +41,7 @@ process REPORT{ def report_tag = test_in[i+1] def report_value = test_in[i+2] - println meta_data if(!sample_data.containsKey(meta_data.sample)){ - // Todo issue grabbing correct tag is here sample_data[meta_data.sample] = [:] sample_data[meta_data.sample]["meta"] = [:] } @@ -680,8 +678,6 @@ def generate_qc_data(data, search_phrases, qc_species_tag){ def species_tag_location = 0 for(k in data){ if(!k.value.meta.metagenomic){ - println k.value - println k.key def species = get_species(k.value[k.key][top_hit_tag], search_phrases, shortest_token) // update coverage first so its values can be used in generating qc messages generate_coverage_data(data[k.key], params.coverage_calc_fields.bp_field, species) diff --git a/tests/main.nf.test b/tests/main.nf.test index ef559da2..b3fde7cd 100644 --- a/tests/main.nf.test +++ 
b/tests/main.nf.test @@ -155,45 +155,45 @@ nextflow_pipeline { // parse output json file def json = path("$launchDir/results/FinalReports/Aggregated/Json/final_report.json").json - assert json.short.short.RawReadSummary.R1."total_bp".equals(118750) - assert json.short.short.RawReadSummary.R1."total_reads".equals(475) - assert json.short.short.RawReadSummary.R1."read_qual_mean".equals(40.0) - assert json.short.short.RawReadSummary.R1."mean_sequence_length".equals(250.0) - - assert json.short.short.FastP.summary.sequencing.equals("paired end (250 cycles + 250 cycles)") - assert json.short.short.FastP.summary.before_filtering.total_reads.equals(950) - assert json.short.short.FastP.filtering_result.passed_filter_reads.equals(950) - assert json.short.short.FastP.filtering_result.low_quality_reads.equals(0) - assert json.short.short.FastP.insert_size.peak.equals(347) - - //assert json.short.meta.metagenomic.equals(false) // Currently, this is "null". - assert json.short.meta.assembly.equals(false) - assert json.short.meta.hybrid.equals(false) - assert json.short.meta.single_end.equals(false) - assert json.short.meta.merge.equals(false) - assert json.short.meta.downsampled.equals(false) - - assert json.short.short.AssemblyCompleted.equals(true) - assert json.short.short.QUAST."0"."Total length (>= 0 bp)".equals("4949") - assert json.short.short.QUAST."0"."Largest contig".equals("4949") - assert json.short.short.QUAST."0"."GC (%)".equals("52.96") - assert json.short.short.QUAST."0"."Avg. 
coverage depth".equals("47") + assert json.INX.INX.RawReadSummary.R1."total_bp".equals(118750) + assert json.INX.INX.RawReadSummary.R1."total_reads".equals(475) + assert json.INX.INX.RawReadSummary.R1."read_qual_mean".equals(40.0) + assert json.INX.INX.RawReadSummary.R1."mean_sequence_length".equals(250.0) + + assert json.INX.INX.FastP.summary.sequencing.equals("paired end (250 cycles + 250 cycles)") + assert json.INX.INX.FastP.summary.before_filtering.total_reads.equals(950) + assert json.INX.INX.FastP.filtering_result.passed_filter_reads.equals(950) + assert json.INX.INX.FastP.filtering_result.low_quality_reads.equals(0) + assert json.INX.INX.FastP.insert_size.peak.equals(347) + + //assert json.INX.meta.metagenomic.equals(false) // Currently, this is "null". + assert json.INX.meta.assembly.equals(false) + assert json.INX.meta.hybrid.equals(false) + assert json.INX.meta.single_end.equals(false) + assert json.INX.meta.merge.equals(false) + assert json.INX.meta.downsampled.equals(false) + + assert json.INX.INX.AssemblyCompleted.equals(true) + assert json.INX.INX.QUAST."0"."Total length (>= 0 bp)".equals("4949") + assert json.INX.INX.QUAST."0"."Largest contig".equals("4949") + assert json.INX.INX.QUAST."0"."GC (%)".equals("52.96") + assert json.INX.INX.QUAST."0"."Avg. 
coverage depth".equals("47") // Below two values should be empty - assert json.short.short.StarAMR."0"."Genotype".equals("None") - assert json.short.short.StarAMR."0"."Predicted Phenotype".equals("Susceptible") - assert json.short.short.StarAMR."0"."Genome Length".equals("4949") + assert json.INX.INX.StarAMR."0"."Genotype".equals("None") + assert json.INX.INX.StarAMR."0"."Predicted Phenotype".equals("Susceptible") + assert json.INX.INX.StarAMR."0"."Genome Length".equals("4949") - assert json.short.short.CheckM."0"."# genomes".equals("5656") - assert json.short.short.CheckM."0"."# markers".equals("56") - assert json.short.short.CheckM."0"."# marker sets".equals("24") - assert json.short.short.CheckM."0".Contamination.equals("0.00") + assert json.INX.INX.CheckM."0"."# genomes".equals("5656") + assert json.INX.INX.CheckM."0"."# markers".equals("56") + assert json.INX.INX.CheckM."0"."# marker sets".equals("24") + assert json.INX.INX.CheckM."0".Contamination.equals("0.00") - assert json.short.short.SevenGeneMLSTReport[0].filename.equals("short.filtered.fasta.gz") + assert json.INX.INX.SevenGeneMLSTReport[0].filename.equals("short.filtered.fasta.gz") - assert json.short.short.Abricate."0".RESISTANCE.equals("NoData") // All Abricate results for this are "NoData". + assert json.INX.INX.Abricate."0".RESISTANCE.equals("NoData") // All Abricate results for this are "NoData". 
- def assembly_path = "$launchDir/results/Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz" + def assembly_path = "$launchDir/results/Assembly/FinalAssembly/INX/short.final.filtered.assembly.fasta.gz" assert path(assembly_path).exists() // parse assembly file @@ -209,17 +209,17 @@ nextflow_pipeline { // output files assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1 assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1 - assert iridanext_samples.short.findAll { it.path == "Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz" }.size() == 1 - assert iridanext_samples.short.findAll { it.path == "Assembly/Quality/QUAST/short/short.transposed_short.quast.quality.tsv" }.size() == 1 - assert iridanext_samples.short.findAll { it.path == "Assembly/Quality/SeqKitStats/short.seqkit.stats.summary.tsv" }.size() == 1 - assert iridanext_samples.short.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/short.mash.screen.taxa.screen.screen" }.size() == 1 - assert iridanext_samples.short.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1 - assert iridanext_samples.short.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1 - assert iridanext_samples.short.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1 - assert iridanext_samples.short.findAll { it.path == "FinalReports/FlattenedReports/short.flat_sample.json.gz" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "Assembly/FinalAssembly/INX/short.final.filtered.assembly.fasta.gz" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "Assembly/Quality/QUAST/short/short.transposed_short.quast.quality.tsv" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == 
"Assembly/Quality/SeqKitStats/short.seqkit.stats.summary.tsv" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/short.mash.screen.taxa.screen.screen" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1 + assert iridanext_samples.INX.findAll { it.path == "FinalReports/FlattenedReports/INX.flat_sample.json.gz" }.size() == 1 // output metadata - assert iridanext_metadata.short."QC Status" == "PASSED" + assert iridanext_metadata.INX."QC Status" == "PASSED" } From a1c3f3eb3b8cea250111e807940c213fd07959fc Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 12 Nov 2024 14:14:35 -0600 Subject: [PATCH 31/32] updated changelog --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62f2a563..3b3d7fd1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - RASUSA now used for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125) -- Sample names (`sample_name` field) can no longer begin with a period. 
[PR 125](https://github.com/phac-nml/mikrokondo/pull/125) ### `Updated` From 899e35b949045ee31c4307dbb6bcc3ac272e33f9 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 12 Nov 2024 14:15:58 -0600 Subject: [PATCH 32/32] updated changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b3d7fd1..90dbd1bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` -- Added a `sample_name` field, `sample` still exists but is used for different purposes [PR 140](https://github.com/phac-nml/mikrokondo/pull/140) +- Added a `sample_name` field, `sample` still exists but is used to incorporate additional names/identifiers in IRIDANext [PR 140](https://github.com/phac-nml/mikrokondo/pull/140) - RASUSA now used for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)