From 365f47b6ee0354b388326348829b5a4a4b1fa271 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Thu, 24 Oct 2024 14:25:58 -0500
Subject: [PATCH 01/32] added irida_next sample field option

---
 assets/schema_input.json          |  9 ++++++++-
 nextflow.config                   |  2 +-
 subworkflows/local/input_check.nf | 12 +++++++++---
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/assets/schema_input.json b/assets/schema_input.json
index 1e092ed1..c6defb68 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -7,10 +7,17 @@
     "items": {
         "type": "object",
         "properties": {
+            "sample_name": {
+                "type": "string",
+                "pattern": "^\\S+$",
+                "meta": ["irida_id"],
+                "unique": true,
+                "errorMessage": "Sample name must be provided and cannot contain spaces"
+            },
             "sample": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "errorMessage": "Sample name must be provided and cannot contain spaces",
+                "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next.",
                 "meta": ["id"]
             },
             "fastq_1": {
diff --git a/nextflow.config b/nextflow.config
index 8a0c5552..c90c43e6 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -43,7 +43,7 @@ params {
     show_hidden_params = false
     validationS3PathCheck = true
     validationShowHiddenParams = false
-    validationSchemaIgnoreParams = 'locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler'
+    validationSchemaIgnoreParams = 'rasusa,locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler'
     validationFailUnrecognisedParams = false // for the qcreport fields
 
     // SKIP options
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 2792d88b..1cd2787b 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -9,7 +9,6 @@ workflow INPUT_CHECK {
 
     main:
 
-    // TODO add in automatic gzipping of all samples in
     versions = Channel.empty()
     def sample_sheet = params.input
     reads_in = Channel.fromSamplesheet(
@@ -102,8 +101,15 @@ def check_file_exists(def file_path){
 def format_reads(ArrayList sheet_data){
     def meta = [:]
     def error_occured = false
-    meta.id = sheet_data[0] // id is first value
-    meta.sample = sheet_data[0] // Sample will be id currently
+    if(sheet_data[1].irida_id != null){
+        meta.irida_id = sheet_data[1].irida_id
+        meta.id = sheet_data[0] // id is first value
+        meta.sample = sheet_data[0] // Sample will be id currently
+    }else{
+        meta.id = sheet_data[0] // id is first value
+        meta.sample = sheet_data[0] // Sample will be id currently
+    }
+
     meta.hybrid = false
     meta.assembly = false
     meta.downsampled = false

From 7edf2aa35d75c7000984ec02138c6441bc6e71f2 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Thu, 24 Oct 2024 17:01:39 -0500
Subject: [PATCH 02/32] identified sticking point for sample names not being
 passed to the iridanext config

---
 conf/irida_next.config  |   2 +-
 main.nf                 |   1 +
 modules/local/report.nf |   4 ++
 nextflow.config         |   3 +-
 tests/main.nf.test      | 126 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 134 insertions(+), 2 deletions(-)

diff --git a/conf/irida_next.config b/conf/irida_next.config
index 24cc2d07..567e570c 100755
--- a/conf/irida_next.config
+++ b/conf/irida_next.config
@@ -11,7 +11,7 @@ iridanext {
         overwrite = true
         validate = false
         files {
-            idkey = "sample"
+            idkey = "irida_id"
             global = [
                 "**/FinalReports/Aggregated/Json/final_report.json",
                 "**/FinalReports/Aggregated/Tables/final_report.tsv"
diff --git a/main.nf b/main.nf
index 71223572..24336b72 100644
--- a/main.nf
+++ b/main.nf
@@ -112,6 +112,7 @@ workflow MIKROKONDO {
         ch_versions = ch_versions.mix(REPORT_AGGREGATE.out.versions)
 
 
+        // TODO need to add logic to merge this channel with a previous one to get its INX id
         updated_samples = REPORT_AGGREGATE.out.flat_samples.flatten().map{
                     sample ->
                         def name_trim = sample.getName()
diff --git a/modules/local/report.nf b/modules/local/report.nf
index 0eccfe08..9b9b015a 100644
--- a/modules/local/report.nf
+++ b/modules/local/report.nf
@@ -47,6 +47,10 @@ process REPORT{
             sample_data[meta_data.sample]["meta"] = [:]
         }
 
+
+        // TODO add a condition around this to only be appened if iridanext is enabled
+        update_map_values(sample_data, meta_data, "irida_id")
+
         update_map_values(sample_data, meta_data, "metagenomic")
         update_map_values(sample_data, meta_data, "assembly")
         update_map_values(sample_data, meta_data, "hybrid")
diff --git a/nextflow.config b/nextflow.config
index c90c43e6..256766f3 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -7,6 +7,7 @@
 */
 
 
+
 // Global default params, used in configs
 params {
     // Input options
@@ -43,7 +44,7 @@ params {
     show_hidden_params = false
     validationS3PathCheck = true
     validationShowHiddenParams = false
-    validationSchemaIgnoreParams = 'rasusa,locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler'
+    validationSchemaIgnoreParams = '__in_iridanext,rasusa,locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler'
     validationFailUnrecognisedParams = false // for the qcreport fields
 
     // SKIP options
diff --git a/tests/main.nf.test b/tests/main.nf.test
index 261ff7cb..5d741ee6 100644
--- a/tests/main.nf.test
+++ b/tests/main.nf.test
@@ -100,6 +100,132 @@ nextflow_pipeline {
         }
     }
 
+        test("Should run without failure unzipped IRIDANext id") {
+        tag "succeed_assembly_inx_id"
+
+        when {
+            params {
+                input = "https://raw.githubusercontent.com/phac-nml/mikrokondo/refs/heads/dev/tests/data/samplesheets/samplesheet-small-assembly-inx.csv"
+                outdir = "results"
+                platform = "illumina"
+
+                mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh"
+                mh_min_kmer = 1
+
+                dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi"
+                kraken2_db = "$baseDir/tests/data/kraken2/test"
+
+                min_reads = 100
+                skip_allele_calling = true
+
+                QCReport {
+                    fallthrough {
+                        search = "No organism specific QC data available."
+                        raw_average_quality = 30
+                        min_n50 = null
+                        max_n50 = null
+                        min_nr_contigs = null
+                        max_nr_contigs = null
+                        fixed_genome_size = 1000
+                        min_length = null
+                        max_length = null
+                        max_checkm_contamination = 3.0
+                        min_average_coverage = 30
+                    }
+                }
+
+
+                skip_bakta = true
+                skip_staramr = false
+                skip_mobrecon = false
+                skip_checkm = false
+                skip_raw_read_metrics = false
+                skip_polishing = false
+
+                max_memory = "2.GB"
+                max_cpus = 1
+            }
+        }
+
+        then {
+
+            assert workflow.success
+            assert path("$launchDir/results").exists()
+
+            // parse output json file
+            def json = path("$launchDir/results/FinalReports/Aggregated/Json/final_report.json").json
+
+            assert json.short.short.RawReadSummary.R1."total_bp".equals(118750)
+            assert json.short.short.RawReadSummary.R1."total_reads".equals(475)
+            assert json.short.short.RawReadSummary.R1."read_qual_mean".equals(40.0)
+            assert json.short.short.RawReadSummary.R1."mean_sequence_length".equals(250.0)
+
+            assert json.short.short.FastP.summary.sequencing.equals("paired end (250 cycles + 250 cycles)")
+            assert json.short.short.FastP.summary.before_filtering.total_reads.equals(950)
+            assert json.short.short.FastP.filtering_result.passed_filter_reads.equals(950)
+            assert json.short.short.FastP.filtering_result.low_quality_reads.equals(0)
+            assert json.short.short.FastP.insert_size.peak.equals(347)
+
+            //assert json.short.meta.metagenomic.equals(false)  // Currently, this is "null".
+            assert json.short.meta.assembly.equals(false)
+            assert json.short.meta.hybrid.equals(false)
+            assert json.short.meta.single_end.equals(false)
+            assert json.short.meta.merge.equals(false)
+            assert json.short.meta.downsampled.equals(false)
+
+            assert json.short.short.AssemblyCompleted.equals(true)
+            assert json.short.short.QUAST."0"."Total length (>= 0 bp)".equals("4949")
+            assert json.short.short.QUAST."0"."Largest contig".equals("4949")
+            assert json.short.short.QUAST."0"."GC (%)".equals("52.96")
+            assert json.short.short.QUAST."0"."Avg. coverage depth".equals("47")
+
+            // Below two values should be empty
+            assert json.short.short.StarAMR."0"."Genotype".equals("None")
+            assert json.short.short.StarAMR."0"."Predicted Phenotype".equals("Susceptible")
+            assert json.short.short.StarAMR."0"."Genome Length".equals("4949")
+
+            assert json.short.short.CheckM."0"."# genomes".equals("5656")
+            assert json.short.short.CheckM."0"."# markers".equals("56")
+            assert json.short.short.CheckM."0"."# marker sets".equals("24")
+            assert json.short.short.CheckM."0".Contamination.equals("0.00")
+
+            assert json.short.short.SevenGeneMLSTReport[0].filename.equals("short.filtered.fasta.gz")
+
+            assert json.short.short.Abricate."0".RESISTANCE.equals("NoData")  // All Abricate results for this are "NoData".
+
+            def assembly_path = "$launchDir/results/Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz"
+            assert path(assembly_path).exists()
+
+            // parse assembly file
+            def assembly_header = path(assembly_path).linesGzip[0]
+            assert assembly_header.startsWith(">NODE_1_length_4949_cov_23.917254")  // _pilon_pilon_pilon gets appended
+
+            // compare IRIDA Next JSON output
+            def iridanext_json = path("$launchDir/results/iridanext.output.json").json
+            def iridanext_global = iridanext_json.files.global
+            def iridanext_samples = iridanext_json.files.samples
+            def iridanext_metadata = iridanext_json.metadata.samples
+
+            // output files
+            assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1
+            assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "Assembly/Quality/QUAST/short/short.transposed_short.quast.quality.tsv" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "Assembly/Quality/SeqKitStats/short.seqkit.stats.summary.tsv" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/short.mash.screen.taxa.screen.screen" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "FinalReports/FlattenedReports/short.flat_sample.json.gz" }.size() == 1
+
+            // output metadata
+            assert iridanext_metadata.INX."QC Status" == "PASSED"
+
+        }
+
+    }
+
+
     test("Should run without failure.") {
         tag "succeed_assembly"
 

From 351a8f11aeeccb7eac45e18443b8766d162b4364 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Mon, 28 Oct 2024 14:39:53 -0500
Subject: [PATCH 03/32] updated iridanext external name id

---
 assets/schema_input.json          |  6 +++---
 bin/report_summaries.py           | 29 ++++++++++++++++++++--------
 conf/irida_next.config            |  2 +-
 main.nf                           | 32 +++++++++++++++++++++++--------
 modules/local/report.nf           |  6 +-----
 modules/local/report_aggregate.nf |  2 +-
 nextflow.config                   |  6 ++----
 subworkflows/local/input_check.nf | 17 ++++++++--------
 tests/main.nf.test                |  2 +-
 9 files changed, 63 insertions(+), 39 deletions(-)

diff --git a/assets/schema_input.json b/assets/schema_input.json
index c6defb68..43321486 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -9,10 +9,10 @@
         "properties": {
             "sample_name": {
                 "type": "string",
-                "pattern": "^\\S+$",
-                "meta": ["irida_id"],
+                "pattern": "^[^\\s\\.]+$",
+                "meta": ["external_id"],
                 "unique": true,
-                "errorMessage": "Sample name must be provided and cannot contain spaces"
+                "errorMessage": "This field cannot contain .iridanext_output."
             },
             "sample": {
                 "type": "string",
diff --git a/bin/report_summaries.py b/bin/report_summaries.py
index a250f753..db4f08ee 100755
--- a/bin/report_summaries.py
+++ b/bin/report_summaries.py
@@ -37,9 +37,11 @@ class JsonImport:
     __keep_keys = frozenset(__key_order.keys())
     __delimiter = "\t"
     __key_delimiter = "."
+    __inx_irida_key = "meta.external_id"
 
-    def __init__(self, report_fp, output_name, sample_suffix):
-        self.tool_data = None # TODO set this in output of group tool fields
+    def __init__(self, report_fp, output_name, sample_suffix, inx_id_token):
+        self.inx_id_token = inx_id_token
+        self.tool_data = None
         self.output_name = output_name
         self.output_transposed = os.path.splitext(os.path.basename(self.output_name))[0] + "_transposed.tsv"
         self.output_dir = os.path.dirname(self.output_name)
@@ -49,7 +51,7 @@ def __init__(self, report_fp, output_name, sample_suffix):
         self.flat_sample_string = sample_suffix
         self.data = self.ingest_report(self.report_fp)
         self.flat_data, self.common_fields, self.tool_fields, self.table = self.flatten_json(self.data)
-        self.output_indv_json(self.flat_data)
+        self.flat_data = self.output_indv_json(self.flat_data)
         self.output_flat_json(self.flat_data)
         self.write_table(self.table)
 
@@ -233,7 +235,6 @@ def remove_prefix_id_fields(self, flattened_dict):
                     top_level_keys.add(item_key)
                 temp[item_key] = v
 
-        #self.tool_data = tool_data
         return reformatted_data, top_level_keys, tool_keys
 
 
@@ -262,11 +263,22 @@ def output_indv_json(self, flattened_data):
         Args:
             flattened_data (json: Dict[sample_id: Dict[tool_info: value]]):
         """
+        updated_items = dict()
         for k, v in flattened_data.items():
-            with open(os.path.join(self.output_dir, k + self.flat_sample_string), "w") as output:
+            out_path = os.path.join(self.output_dir, k + self.flat_sample_string)
+            out_key = k
+            if inx_id := v.get(self.__inx_irida_key):
+                #! this field affects the identification of the irida next id being passed out of the pipeline
+                out_path = os.path.join(self.output_dir, k + self.inx_id_token + inx_id + self.flat_sample_string)
+                out_key = inx_id # this field must be overwritten for iridanext to identify the correct metadata field
+
+            with open(out_path, "w") as output:
                 json_data = json.dumps({k: v}, indent=2)
                 output.write(json_data)
+            updated_items[out_key] = v
 
+        flattened_data = updated_items
+        return flattened_data
 
     def to_file(self):
         with open(self.output_name, "w") as out_file:
@@ -291,15 +303,16 @@ def to_file(self):
 
 
 
-def main_(args_in):
+def main(args_in):
     default_samp_suffix = "_flat_sample.json"
     parser = argparse.ArgumentParser("Table Summary")
     parser.add_argument("-f", "--file-in", help="Path to the mikrokondo json summary")
     parser.add_argument("-s", "--sample-tag", help="Optional suffix and extension to name output samples.", default=default_samp_suffix)
     parser.add_argument("-o", "--out-file", help="output name plus the .tsv extension e.g. prefix.tsv")
+    parser.add_argument("-x", "--inx-id-token", help="A token to insert into the flattened json file names for separation of the irida next sample id.")
     args = parser.parse_args(args_in)
     if os.path.isfile(args.file_in):
-        JsonImport(args.file_in, args.out_file, args.sample_tag)
+        JsonImport(args.file_in, args.out_file, args.sample_tag, args.inx_id_token)
     else:
         sys.stderr.write(f"{args.file_in} does not exist.\n")
         sys.exit(-1)
@@ -307,4 +320,4 @@ def main_(args_in):
 
 if __name__ == "__main__":
     # pass json file to program to parse it
-    main_(sys.argv[1:])
+    main(sys.argv[1:])
diff --git a/conf/irida_next.config b/conf/irida_next.config
index 567e570c..c2a03d41 100755
--- a/conf/irida_next.config
+++ b/conf/irida_next.config
@@ -11,7 +11,7 @@ iridanext {
         overwrite = true
         validate = false
         files {
-            idkey = "irida_id"
+            idkey = 'external_id'  // Previously sample
             global = [
                 "**/FinalReports/Aggregated/Json/final_report.json",
                 "**/FinalReports/Aggregated/Tables/final_report.tsv"
diff --git a/main.nf b/main.nf
index 24336b72..6c9b3631 100644
--- a/main.nf
+++ b/main.nf
@@ -42,9 +42,6 @@ if (params.help) {
 if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
 
 
-
-
-
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     NAMED WORKFLOW FOR PIPELINE
@@ -111,16 +108,35 @@ workflow MIKROKONDO {
         REPORT_AGGREGATE(REPORT.out.final_report)
         ch_versions = ch_versions.mix(REPORT_AGGREGATE.out.versions)
 
-
-        // TODO need to add logic to merge this channel with a previous one to get its INX id
         updated_samples = REPORT_AGGREGATE.out.flat_samples.flatten().map{
                     sample ->
+                        def inx_string_suffix = params.report_aggregate.inx_string_insertion
                         def name_trim = sample.getName()
                         def trimmed_name = name_trim.substring(0, name_trim.length() - params.report_aggregate.sample_flat_suffix.length())
-                        tuple([
+                        def output_map = [
                             "id": trimmed_name,
-                            "sample": trimmed_name],
-                            sample)
+                            "sample": trimmed_name,
+                            "external_id": trimmed_name]
+
+                        def inx_sample_p = trimmed_name.indexOf(params.report_aggregate.inx_string_insertion)
+                        if(inx_sample_p){
+                            if(trimmed_name[0..<inx_string_suffix.length()] == inx_string_suffix && trimmed_name.count(inx_string_suffix) == 2){
+                                // Guard statement for the very unlikely situation where someone named their sample whatever the variable inx_string_suffix is set to
+                                output_map.id = inx_string_suffix
+                                output_map.sample = inx_string_suffix
+                                output_map.external_id = inx_string_suffix
+
+                            }else{
+                                def inx_id = trimmed_name.substring(inx_sample_p + inx_string_suffix.length(), trimmed_name.length())
+                                trimmed_name = trimmed_name.substring(0, inx_sample_p)
+                                output_map.id = trimmed_name
+                                output_map.sample = trimmed_name
+                                output_map.external_id = inx_id
+                            }
+
+                        }
+
+                        tuple(output_map, sample)
                     }
 
         GZIP_FILES(updated_samples)
diff --git a/modules/local/report.nf b/modules/local/report.nf
index 9b9b015a..8a469955 100644
--- a/modules/local/report.nf
+++ b/modules/local/report.nf
@@ -43,15 +43,11 @@ process REPORT{
 
         if(!sample_data.containsKey(meta_data.sample)){
             sample_data[meta_data.sample] = [:]
-            // TODO add strings to constants file
             sample_data[meta_data.sample]["meta"] = [:]
         }
 
-
-        // TODO add a condition around this to only be appened if iridanext is enabled
-        update_map_values(sample_data, meta_data, "irida_id")
-
         update_map_values(sample_data, meta_data, "metagenomic")
+        update_map_values(sample_data, meta_data, "external_id")
         update_map_values(sample_data, meta_data, "assembly")
         update_map_values(sample_data, meta_data, "hybrid")
         update_map_values(sample_data, meta_data, "single_end")
diff --git a/modules/local/report_aggregate.nf b/modules/local/report_aggregate.nf
index 36f67c59..3a2cf787 100644
--- a/modules/local/report_aggregate.nf
+++ b/modules/local/report_aggregate.nf
@@ -20,7 +20,7 @@ process REPORT_AGGREGATE{
     script:
     sample_flat_suffix = params.report_aggregate.sample_flat_suffix
     """
-    report_summaries.py -f ${summary_report} -o final_report.tsv -s ${sample_flat_suffix}
+    report_summaries.py -f ${summary_report} -o final_report.tsv -s ${sample_flat_suffix} -x ${params.report_aggregate.inx_string_insertion}
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         python: \$(python --version | sed 's/Python //g')
diff --git a/nextflow.config b/nextflow.config
index 256766f3..c4a13e53 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -48,8 +48,6 @@ params {
     validationFailUnrecognisedParams = false // for the qcreport fields
 
     // SKIP options
-    // TODO need to add constants section
-    // TODO investigate usage of template scripts to replace mash modules
     skip_report = false
     skip_raw_read_metrics = false
     skip_version_gathering = false
@@ -59,7 +57,7 @@ params {
     skip_checkm = false
     skip_depth_sampling = false // TODO have it mentioned that this should be turned off for metagenomic runs
     skip_ont_header_cleaning = true // TODO an awk script can likely replace this and be much faster at what it does...
-    skip_polishing = false // TODO make it clear this does not apply to Hybrid assembly
+    skip_polishing = false
     skip_species_classification = false
     skip_mlst = false
     skip_mobrecon = false
@@ -147,7 +145,6 @@ params {
     version                    = false
 
 
-
     // If a param in camel case is present nextflow automatically creates a kebab case parameter as well
 
     stage_in_mode = 'symlink'
@@ -719,6 +716,7 @@ params {
 
     report_aggregate {
         sample_flat_suffix = ".flat_sample.json"
+        inx_string_insertion = ".iridanext_output."
     }
 
 
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 1cd2787b..d95833a7 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -5,6 +5,9 @@
 include { COMBINE_DATA } from '../../modules/local/combine_data.nf'
 include { fromSamplesheet } from 'plugin/nf-validation'
 
+
+
+
 workflow INPUT_CHECK {
 
     main:
@@ -69,7 +72,7 @@ def reset_combined_map(LinkedHashMap meta, sun.nio.fs.UnixPath f_reads, sun.nio.
     /*Re-format the data to make it similar to make it match the input format again
 
     */
-    // TODO find a way to make this cleaner
+
     def new_meta = meta
     new_meta.merge = true
 
@@ -101,13 +104,11 @@ def check_file_exists(def file_path){
 def format_reads(ArrayList sheet_data){
     def meta = [:]
     def error_occured = false
-    if(sheet_data[1].irida_id != null){
-        meta.irida_id = sheet_data[1].irida_id
-        meta.id = sheet_data[0] // id is first value
-        meta.sample = sheet_data[0] // Sample will be id currently
-    }else{
-        meta.id = sheet_data[0] // id is first value
-        meta.sample = sheet_data[0] // Sample will be id currently
+    meta.id = sheet_data[0] // id is first value
+    meta.sample = sheet_data[0] // Sample will be id currently
+    meta.external_id = sheet_data[0] // This is duplicated to keep later scripting cleaner
+    if(sheet_data[1].external_id != null){
+        meta.external_id = sheet_data[1].external_id
     }
 
     meta.hybrid = false
diff --git a/tests/main.nf.test b/tests/main.nf.test
index 5d741ee6..784fa7be 100644
--- a/tests/main.nf.test
+++ b/tests/main.nf.test
@@ -216,7 +216,7 @@ nextflow_pipeline {
             assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1
             assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1
             assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1
-            assert iridanext_samples.INX.findAll { it.path == "FinalReports/FlattenedReports/short.flat_sample.json.gz" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "FinalReports/FlattenedReports/short.iridanext_output.INX.flat_sample.json.gz" }.size() == 1
 
             // output metadata
             assert iridanext_metadata.INX."QC Status" == "PASSED"

From acdb88421e7840d2fb15be2ff4b1dc5444b6d41d Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Mon, 28 Oct 2024 14:50:39 -0500
Subject: [PATCH 04/32] updated changelog and docs

---
 CHANGELOG.md             | 10 +++++++++-
 assets/schema_input.json |  6 +++---
 docs/usage/usage.md      | 12 ++++++------
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5d446f39..96f46bc5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,10 +5,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
-### `Changed`
+### `Added`
 
 - Added RASUSA for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)
 
+- Added a new field to the `schema_input.json` file to allow for sample ID's from external systems such as IRIDA Next: [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)
+
+### `Changed`
+
+- Added a `sample_name` field, `sample` still exists but is used for different purposes [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)
+
+- RASUSA now used for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)
+
 ### `Updated`
 
 - Documentation and workflow diagram has been updated. [PR 123](https://github.com/phac-nml/mikrokondo/pull/123)
diff --git a/assets/schema_input.json b/assets/schema_input.json
index 43321486..01aeca1d 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -7,14 +7,14 @@
     "items": {
         "type": "object",
         "properties": {
-            "sample_name": {
+            "sample": {
                 "type": "string",
                 "pattern": "^[^\\s\\.]+$",
                 "meta": ["external_id"],
                 "unique": true,
                 "errorMessage": "This field cannot contain .iridanext_output."
             },
-            "sample": {
+            "sample_name": {
                 "type": "string",
                 "pattern": "^\\S+$",
                 "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next.",
@@ -54,6 +54,6 @@
                 "unique": true
             }
         },
-        "required": ["sample"]
+        "required": ["sample_name"]
     }
 }
diff --git a/docs/usage/usage.md b/docs/usage/usage.md
index b2cf6c6f..09792343 100644
--- a/docs/usage/usage.md
+++ b/docs/usage/usage.md
@@ -17,7 +17,7 @@ This pipeline requires sample files to be gzipped (symlinks may be problematic).
 ### Samplesheet (CSV)
 Mikrokondo requires a sample sheet to be run. This FOFN (file of file names) contains the samples names and allows a user to combine read-sets based on that name if provided. The sample-sheet can utilize the following header fields:
 
-- sample
+- sample_name
 - fastq_1
 - fastq_2
 - long_reads
@@ -28,31 +28,31 @@ Example layouts for different sample-sheets include:
 
 _Illumina paired-end data_
 
-|sample|fastq_1|fastq_2|
+|sample_name|fastq_1|fastq_2|
 |------|-------|-------|
 |sample_name|path_to_forward_reads|path_to_reversed_reads|
 
 _Nanopore_
 
-|sample|long_reads|
+|sample_name|long_reads|
 |------|----------|
 |sample_name|path_to_reads|
 
 _Hybrid Assembly_
 
-|sample|fastq_1|fastq_2|long_reads|
+|sample_name|fastq_1|fastq_2|long_reads|
 |-------|-------|------|----------|
 |sample_name|path_to_forward_reads|path_to_reversed_reads|path_to_long_reads|
 
 _Starting with assembly only_
 
-|sample|assembly|
+|sample_name|assembly|
 |------|--------|
 |sample_name|path_to_assembly|
 
 _Example merging paired-end data_
 
-|sample|fastq_1|fastq_2|
+|sample_name|fastq_1|fastq_2|
 |------|-------|-------|
 |my_sample|path_to_forward_reads_1|path_to_reversed_reads_1|
 |my_sample|path_to_forward_reads_2|path_to_reversed_reads_2|

From dcdce6d77515dda3e947e09b917335f6e91a8bd4 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Mon, 28 Oct 2024 15:47:08 -0500
Subject: [PATCH 05/32] updated sample sheet names

---
 tests/data/samplesheets/samplesheet-campy-staph.csv           | 2 +-
 tests/data/samplesheets/samplesheet-small-assembly.csv        | 2 +-
 tests/data/samplesheets/samplesheet-small-metagenomic.csv     | 2 +-
 .../samplesheet-test-from-assemblies-listeria.csv             | 2 +-
 .../samplesheet-test-from-assemblies-salmonella.csv           | 2 +-
 .../samplesheets/samplesheet-test-from-assemblies-vibrio.csv  | 2 +-
 tests/data/samplesheets/samplesheet-test-from-assemblies.csv  | 4 ++--
 7 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/data/samplesheets/samplesheet-campy-staph.csv b/tests/data/samplesheets/samplesheet-campy-staph.csv
index 203b4804..8bb1350f 100644
--- a/tests/data/samplesheets/samplesheet-campy-staph.csv
+++ b/tests/data/samplesheets/samplesheet-campy-staph.csv
@@ -1,2 +1,2 @@
-sample,fastq_1,fastq_2,long_reads,assembly
+sample_name,fastq_1,fastq_2,long_reads,assembly
 CSE,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,,
diff --git a/tests/data/samplesheets/samplesheet-small-assembly.csv b/tests/data/samplesheets/samplesheet-small-assembly.csv
index bc658f43..8ea218c7 100644
--- a/tests/data/samplesheets/samplesheet-small-assembly.csv
+++ b/tests/data/samplesheets/samplesheet-small-assembly.csv
@@ -1,2 +1,2 @@
-sample,fastq_1,fastq_2,long_reads,assembly
+sample_name,fastq_1,fastq_2,long_reads,assembly
 short,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq.gz,,
diff --git a/tests/data/samplesheets/samplesheet-small-metagenomic.csv b/tests/data/samplesheets/samplesheet-small-metagenomic.csv
index 3e341b9b..d245aff8 100644
--- a/tests/data/samplesheets/samplesheet-small-metagenomic.csv
+++ b/tests/data/samplesheets/samplesheet-small-metagenomic.csv
@@ -1,2 +1,2 @@
-sample,fastq_1,fastq_2,long_reads,assembly
+sample_name,fastq_1,fastq_2,long_reads,assembly
 meta-small,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads2.fq.gz,,
diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv
index 69236531..7bc21370 100644
--- a/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv
+++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv
@@ -1,2 +1,2 @@
-sample,fastq_1,fastq_2,long_reads,assembly
+sample_name,fastq_1,fastq_2,long_reads,assembly
 listeria_GCF_000196035,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/listeria/GCF_000196035.1_ASM19603v1_genomic.fna.gz
diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv
index 2526bbe5..d9ca7157 100644
--- a/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv
+++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv
@@ -1,2 +1,2 @@
-sample,fastq_1,fastq_2,long_reads,assembly
+sample_name,fastq_1,fastq_2,long_reads,assembly
 salmonella_GCA_000008105,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/salmonella/GCA_000008105.1_ASM810v1_genomic.fna.gz
diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv
index 98a1f026..f475bdc8 100644
--- a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv
+++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv
@@ -1,2 +1,2 @@
-sample,fastq_1,fastq_2,long_reads,assembly
+sample_name,fastq_1,fastq_2,long_reads,assembly
 st_120,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz
diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies.csv
index 07a21039..4de3619c 100644
--- a/tests/data/samplesheets/samplesheet-test-from-assemblies.csv
+++ b/tests/data/samplesheets/samplesheet-test-from-assemblies.csv
@@ -1,2 +1,2 @@
-sample,fastq_1,fastq_2,long_reads,assembly
-ecoli_GCA_000947975,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/ecoli/GCA_000947975.1_ASM94797v1_genomic.fna.gz
\ No newline at end of file
+sample_name,fastq_1,fastq_2,long_reads,assembly
+ecoli_GCA_000947975,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/ecoli/GCA_000947975.1_ASM94797v1_genomic.fna.gz

From fd4ea245dac56514ef17da7d8a23fe7f00b65a5c Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Mon, 28 Oct 2024 16:56:29 -0500
Subject: [PATCH 06/32] updated inx id parsing

---
 bin/report_summaries.py           |  5 +++--
 main.nf                           | 22 +++++++---------------
 subworkflows/local/input_check.nf |  1 -
 3 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/bin/report_summaries.py b/bin/report_summaries.py
index db4f08ee..8ee4c064 100755
--- a/bin/report_summaries.py
+++ b/bin/report_summaries.py
@@ -243,7 +243,7 @@ def ingest_report(self, report_fp):
         report_fp: File path to the json report to be read in
         """
         data = None
-        with open(report_fp, "r", encoding="utf8") as report:
+        with open(report_fp, "r") as report:
             data = json.load(report)
         return data
 
@@ -267,7 +267,8 @@ def output_indv_json(self, flattened_data):
         for k, v in flattened_data.items():
             out_path = os.path.join(self.output_dir, k + self.flat_sample_string)
             out_key = k
-            if inx_id := v.get(self.__inx_irida_key):
+            if v.get(self.__inx_irida_key) != k:
+                inx_id = v[self.__inx_irida_key]
                 #! this field affects the identification of the irida next id being passed out of the pipeline
                 out_path = os.path.join(self.output_dir, k + self.inx_id_token + inx_id + self.flat_sample_string)
                 out_key = inx_id # this field must be overwritten for iridanext to identify the correct metdata field
diff --git a/main.nf b/main.nf
index 6c9b3631..f32e40c1 100644
--- a/main.nf
+++ b/main.nf
@@ -118,22 +118,14 @@ workflow MIKROKONDO {
                             "sample": trimmed_name,
                             "external_id": trimmed_name]
 
-                        def inx_sample_p = trimmed_name.indexOf(params.report_aggregate.inx_string_insertion)
+                        def inx_sample_p = trimmed_name.contains(inx_string_suffix)
+                        println "inx_sample_p: ${}"
                         if(inx_sample_p){
-                            if(trimmed_name[0..<inx_string_suffix.length()] == inx_string_suffix && trimmed_name.count(inx_string_suffix) == 2){
-                                // Gaurd statement for the very unlikely situation where someone named there sample whatever the variable inx_string_suffix is set too
-                                output_map.id = inx_string_suffix
-                                output_map.sample = inx_string_suffix
-                                output_map.external_id = inx_string_suffix
-
-                            }else{
-                                def inx_id = trimmed_name.substring(inx_sample_p + inx_string_suffix.length(), trimmed_name.length())
-                                trimmed_name = trimmed_name.substring(0, inx_sample_p)
-                                output_map.id = trimmed_name
-                                output_map.sample = trimmed_name
-                                output_map.external_id = inx_id
-                            }
-
+                            def inx_id = trimmed_name.substring(inx_sample_p + inx_string_suffix.length(), trimmed_name.length())
+                            trimmed_name = trimmed_name.substring(0, inx_sample_p)
+                            output_map.id = trimmed_name
+                            output_map.sample = trimmed_name
+                            output_map.external_id = inx_id
                         }
 
                         tuple(output_map, sample)
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index d95833a7..b5d2ebc3 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -22,7 +22,6 @@ workflow INPUT_CHECK {
             meta -> tuple(meta.id[0], meta[0])
         }
 
-
     if(params.opt_platforms.ont == params.platform && params.nanopore_chemistry == null){
         exit 1, "ERROR: Nanopore data was selected without a model being specified."
     }

From 0d81ebfd8051d8902dbd0900789c1738f323e272 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Mon, 28 Oct 2024 17:02:41 -0500
Subject: [PATCH 07/32] updated sample sheet parsing

---
 main.nf | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index f32e40c1..b2113ebc 100644
--- a/main.nf
+++ b/main.nf
@@ -118,9 +118,8 @@ workflow MIKROKONDO {
                             "sample": trimmed_name,
                             "external_id": trimmed_name]
 
-                        def inx_sample_p = trimmed_name.contains(inx_string_suffix)
-                        println "inx_sample_p: ${}"
-                        if(inx_sample_p){
+                        def inx_sample_p = trimmed_name.indexOf(inx_string_suffix)
+                        if(inx_sample_p > 0){ // -1 was not being evaluated as true
                             def inx_id = trimmed_name.substring(inx_sample_p + inx_string_suffix.length(), trimmed_name.length())
                             trimmed_name = trimmed_name.substring(0, inx_sample_p)
                             output_map.id = trimmed_name

From c036fb5025a7cfc4ced6b34227dd601cacf57620 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Tue, 29 Oct 2024 12:48:57 -0500
Subject: [PATCH 08/32] updated tests

---
 CHANGELOG.md                                  |  2 +
 assets/schema_input.json                      |  4 +-
 ...st-from-assemblies-vibrio-stupid-names.csv |  2 +
 tests/pipelines/main.from_assemblies.nf.test  | 62 +++++++++++++++++++
 4 files changed, 68 insertions(+), 2 deletions(-)
 create mode 100644 tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 96f46bc5..58aa5c80 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - RASUSA now used for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)
 
+- Sample names (`sample_name` field) can no longer begin with a period. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)
+
 ### `Updated`
 
 - Documentation and workflow diagram has been updated. [PR 123](https://github.com/phac-nml/mikrokondo/pull/123)
diff --git a/assets/schema_input.json b/assets/schema_input.json
index 01aeca1d..2ca6a15e 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -16,8 +16,8 @@
             },
             "sample_name": {
                 "type": "string",
-                "pattern": "^\\S+$",
-                "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next.",
+                "pattern": "^[^\\.]\\S+$",
+                "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. Input cannot start with period.",
                 "meta": ["id"]
             },
             "fastq_1": {
diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv
new file mode 100644
index 00000000..c5215eac
--- /dev/null
+++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv
@@ -0,0 +1,2 @@
+sample,sample_name,fastq_1,fastq_2,long_reads,assembly
+INX,iridanext_output.,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz
diff --git a/tests/pipelines/main.from_assemblies.nf.test b/tests/pipelines/main.from_assemblies.nf.test
index 99946300..652f21c2 100644
--- a/tests/pipelines/main.from_assemblies.nf.test
+++ b/tests/pipelines/main.from_assemblies.nf.test
@@ -796,4 +796,66 @@ nextflow_pipeline {
         }
     }
 
+        test("Test Stupid Name in Input Sheet") {
+        tag "from_assemblies_stupidnames"
+
+        when {
+            params {
+                // Need to update with 7 gene when complete
+                input = "$baseDir/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv"
+                outdir = "results"
+
+                platform = "illumina"
+
+                mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/mash.msh"
+                dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi"
+                kraken2_db = "$baseDir/tests/data/kraken2/test"
+                lx_allele_database = "$baseDir/tests/data/databases/locidex_dbs"
+                qt_min_contig_length = 1
+
+                skip_bakta = true
+                skip_mobrecon = true
+                skip_checkm = true
+                skip_raw_read_metrics = true
+                skip_polishing = true
+                skip_mlst = true
+                skip_version_gathering = true
+                skip_staramr = true
+                skip_length_filtering_contigs = true
+
+                skip_subtyping = false
+                skip_allele_calling = false
+                lx_report_max_stop = 100
+                max_memory = "2.GB"
+                max_cpus = 1
+            }
+        }
+
+        then {
+            assert workflow.success
+            assert path("$launchDir/results").exists()
+
+            // Compare IRIDANext json
+            assert path("$launchDir/results/iridanext.output.json").exists()
+            def iridanext_json = path("$launchDir/results/iridanext.output.json").json
+            def iridanext_global = iridanext_json.files.global
+            def iridanext_samples = iridanext_json.files.samples
+            def iridanext_metadata = iridanext_json.metadata.samples
+            def vibrio_metadata = iridanext_metadata.INX
+
+            // Output files
+            assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1
+            assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1
+
+            assert vibrio_metadata.locidex_db_name == "Vibrio cholerae"
+            assert vibrio_metadata.locidex_db_date == "2024-07-30"
+            assert vibrio_metadata.locidex_db_version == "1.0.0"
+            assert vibrio_metadata.total_loci == 7
+            assert vibrio_metadata.count_loci_found == 6
+            assert vibrio_metadata.count_loci_missing == 1
+
+        }
+    }
+
+
 }

From 6cb6d8c9cfc8e3efa5f8b778b62ed7cef39006b5 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Thu, 31 Oct 2024 13:08:49 -0500
Subject: [PATCH 09/32] updating commits for feedback

---
 assets/schema_input.json |  6 +++---
 docs/usage/usage.md      | 20 ++++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/assets/schema_input.json b/assets/schema_input.json
index 2ca6a15e..68802293 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -7,14 +7,14 @@
     "items": {
         "type": "object",
         "properties": {
-            "sample": {
+            "sample_name": {
                 "type": "string",
                 "pattern": "^[^\\s\\.]+$",
                 "meta": ["external_id"],
                 "unique": true,
                 "errorMessage": "This field cannot contain .iridanext_output."
             },
-            "sample_name": {
+            "sample": {
                 "type": "string",
                 "pattern": "^[^\\.]\\S+$",
                 "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. Input cannot start with period.",
@@ -54,6 +54,6 @@
                 "unique": true
             }
         },
-        "required": ["sample_name"]
+        "required": ["sample"]
     }
 }
diff --git a/docs/usage/usage.md b/docs/usage/usage.md
index 09792343..34aeec1c 100644
--- a/docs/usage/usage.md
+++ b/docs/usage/usage.md
@@ -17,7 +17,7 @@ This pipeline requires sample files to be gzipped (symlinks may be problematic).
 ### Samplesheet (CSV)
 Mikrokondo requires a sample sheet to be run. This FOFN (file of file names) contains the samples names and allows a user to combine read-sets based on that name if provided. The sample-sheet can utilize the following header fields:
 
-- sample_name
+- sample
 - fastq_1
 - fastq_2
 - long_reads
@@ -28,31 +28,31 @@ Example layouts for different sample-sheets include:
 
 _Illumina paired-end data_
 
-|sample_name|fastq_1|fastq_2|
+|sample|fastq_1|fastq_2|
 |------|-------|-------|
-|sample_name|path_to_forward_reads|path_to_reversed_reads|
+|sample|path_to_forward_reads|path_to_reversed_reads|
 
 _Nanopore_
 
-|sample_name|long_reads|
+|sample|long_reads|
 |------|----------|
-|sample_name|path_to_reads|
+|sample|path_to_reads|
 
 _Hybrid Assembly_
 
-|sample_name|fastq_1|fastq_2|long_reads|
+|sample|fastq_1|fastq_2|long_reads|
 |-------|-------|------|----------|
-|sample_name|path_to_forward_reads|path_to_reversed_reads|path_to_long_reads|
+|sample|path_to_forward_reads|path_to_reversed_reads|path_to_long_reads|
 
 _Starting with assembly only_
 
-|sample_name|assembly|
+|sample|assembly|
 |------|--------|
-|sample_name|path_to_assembly|
+|sample|path_to_assembly|
 
 _Example merging paired-end data_
 
-|sample_name|fastq_1|fastq_2|
+|sample|fastq_1|fastq_2|
 |------|-------|-------|
 |my_sample|path_to_forward_reads_1|path_to_reversed_reads_1|
 |my_sample|path_to_forward_reads_2|path_to_reversed_reads_2|

From db34308797a4043a138ece4a86bc9b0351a94d86 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Thu, 31 Oct 2024 13:20:55 -0500
Subject: [PATCH 10/32] updated samplesheets

---
 tests/data/samplesheets/samplesheet-campy-staph.csv             | 2 +-
 tests/data/samplesheets/samplesheet-small-assembly.csv          | 2 +-
 tests/data/samplesheets/samplesheet-small-metagenomic.csv       | 2 +-
 .../samplesheets/samplesheet-test-from-assemblies-listeria.csv  | 2 +-
 .../samplesheet-test-from-assemblies-salmonella.csv             | 2 +-
 .../samplesheet-test-from-assemblies-vibrio-stupid-names.csv    | 2 +-
 .../samplesheets/samplesheet-test-from-assemblies-vibrio.csv    | 2 +-
 tests/data/samplesheets/samplesheet-test-from-assemblies.csv    | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/data/samplesheets/samplesheet-campy-staph.csv b/tests/data/samplesheets/samplesheet-campy-staph.csv
index 8bb1350f..203b4804 100644
--- a/tests/data/samplesheets/samplesheet-campy-staph.csv
+++ b/tests/data/samplesheets/samplesheet-campy-staph.csv
@@ -1,2 +1,2 @@
-sample_name,fastq_1,fastq_2,long_reads,assembly
+sample,fastq_1,fastq_2,long_reads,assembly
 CSE,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,,
diff --git a/tests/data/samplesheets/samplesheet-small-assembly.csv b/tests/data/samplesheets/samplesheet-small-assembly.csv
index 8ea218c7..bc658f43 100644
--- a/tests/data/samplesheets/samplesheet-small-assembly.csv
+++ b/tests/data/samplesheets/samplesheet-small-assembly.csv
@@ -1,2 +1,2 @@
-sample_name,fastq_1,fastq_2,long_reads,assembly
+sample,fastq_1,fastq_2,long_reads,assembly
 short,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq.gz,,
diff --git a/tests/data/samplesheets/samplesheet-small-metagenomic.csv b/tests/data/samplesheets/samplesheet-small-metagenomic.csv
index d245aff8..3e341b9b 100644
--- a/tests/data/samplesheets/samplesheet-small-metagenomic.csv
+++ b/tests/data/samplesheets/samplesheet-small-metagenomic.csv
@@ -1,2 +1,2 @@
-sample_name,fastq_1,fastq_2,long_reads,assembly
+sample,fastq_1,fastq_2,long_reads,assembly
 meta-small,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads2.fq.gz,,
diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv
index 7bc21370..69236531 100644
--- a/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv
+++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-listeria.csv
@@ -1,2 +1,2 @@
-sample_name,fastq_1,fastq_2,long_reads,assembly
+sample,fastq_1,fastq_2,long_reads,assembly
 listeria_GCF_000196035,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/listeria/GCF_000196035.1_ASM19603v1_genomic.fna.gz
diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv
index d9ca7157..2526bbe5 100644
--- a/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv
+++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-salmonella.csv
@@ -1,2 +1,2 @@
-sample_name,fastq_1,fastq_2,long_reads,assembly
+sample,fastq_1,fastq_2,long_reads,assembly
 salmonella_GCA_000008105,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/salmonella/GCA_000008105.1_ASM810v1_genomic.fna.gz
diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv
index c5215eac..eb265562 100644
--- a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv
+++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv
@@ -1,2 +1,2 @@
-sample,sample_name,fastq_1,fastq_2,long_reads,assembly
+sample,sample,fastq_1,fastq_2,long_reads,assembly
 INX,iridanext_output.,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz
diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv
index f475bdc8..98a1f026 100644
--- a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv
+++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio.csv
@@ -1,2 +1,2 @@
-sample_name,fastq_1,fastq_2,long_reads,assembly
+sample,fastq_1,fastq_2,long_reads,assembly
 st_120,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz
diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies.csv
index 4de3619c..ba8e235d 100644
--- a/tests/data/samplesheets/samplesheet-test-from-assemblies.csv
+++ b/tests/data/samplesheets/samplesheet-test-from-assemblies.csv
@@ -1,2 +1,2 @@
-sample_name,fastq_1,fastq_2,long_reads,assembly
+sample,fastq_1,fastq_2,long_reads,assembly
 ecoli_GCA_000947975,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/genomes/ecoli/GCA_000947975.1_ASM94797v1_genomic.fna.gz

From 0c6e6d1630a725527ab1b773c4bb5f074e9fcd34 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Thu, 31 Oct 2024 13:55:52 -0500
Subject: [PATCH 11/32] updated sample sheet name

---
 .../samplesheet-test-from-assemblies-vibrio-stupid-names.csv    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv
index eb265562..0a1d49a8 100644
--- a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv
+++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv
@@ -1,2 +1,2 @@
-sample,sample,fastq_1,fastq_2,long_reads,assembly
+sample_name,sample,fastq_1,fastq_2,long_reads,assembly
 INX,iridanext_output.,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz

From d1e5609f61223017a7d6ce2d14bec15015ba1059 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Thu, 31 Oct 2024 16:25:26 -0500
Subject: [PATCH 12/32] updated external_id parsing, tests will fail as path
 locations need to be updated

---
 bin/report_summaries.py           | 17 +++++++++--------
 main.nf                           | 13 ++-----------
 modules/local/report_aggregate.nf |  4 ++--
 nextflow.config                   |  1 -
 4 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/bin/report_summaries.py b/bin/report_summaries.py
index 8ee4c064..7686082c 100755
--- a/bin/report_summaries.py
+++ b/bin/report_summaries.py
@@ -39,8 +39,7 @@ class JsonImport:
     __key_delimiter = "."
     __inx_irida_key = "meta.external_id"
 
-    def __init__(self, report_fp, output_name, sample_suffix, inx_id_token):
-        self.inx_id_token = inx_id_token
+    def __init__(self, report_fp, output_name, sample_suffix):
         self.tool_data = None
         self.output_name = output_name
         self.output_transposed = os.path.splitext(os.path.basename(self.output_name))[0] + "_transposed.tsv"
@@ -265,13 +264,16 @@ def output_indv_json(self, flattened_data):
         """
         updated_items = dict()
         for k, v in flattened_data.items():
-            out_path = os.path.join(self.output_dir, k + self.flat_sample_string)
+            #out_path = os.path.join(self.output_dir, k + self.flat_sample_string)
             out_key = k
+            sample_dir = k
             if v.get(self.__inx_irida_key) != k:
-                inx_id = v[self.__inx_irida_key]
+                sample_dir = v[self.__inx_irida_key]
                 #! this field affects the identification of the irida next id being passed out of the pipeline
-                out_path = os.path.join(self.output_dir, k + self.inx_id_token + inx_id + self.flat_sample_string)
-                out_key = inx_id # this field must be overwritten for iridanext to identify the correct metdata field
+                out_key = sample_dir # this field must be overwritten for iridanext to identify the correct metdata field
+            out_path = os.path.join(self.output_dir, sample_dir, k + self.flat_sample_string)
+            if not os.is_dir(out_path): # Check for directory existence, as it will still exist on pipeline resumes
+                os.mkdir(out_path)
 
             with open(out_path, "w") as output:
                 json_data = json.dumps({k: v}, indent=2)
@@ -310,10 +312,9 @@ def main(args_in):
     parser.add_argument("-f", "--file-in", help="Path to the mikrokondo json summary")
     parser.add_argument("-s", "--sample-tag", help="Optional suffix and extension to name output samples.", default=default_samp_suffix)
     parser.add_argument("-o", "--out-file", help="output name plus the .tsv extension e.g. prefix.tsv")
-    parser.add_argument("-x", "--inx-id-token", help="A token to insert into the flattened json file names for separation of the irida next sample id.")
     args = parser.parse_args(args_in)
     if os.path.isfile(args.file_in):
-        JsonImport(args.file_in, args.out_file, args.sample_tag, args.inx_id_token)
+        JsonImport(args.file_in, args.out_file, args.sample_tag)
     else:
         sys.stderr.write(f"{args.file_in} does not exist.\n")
         sys.exit(-1)
diff --git a/main.nf b/main.nf
index b2113ebc..cd761b7e 100644
--- a/main.nf
+++ b/main.nf
@@ -110,22 +110,13 @@ workflow MIKROKONDO {
 
         updated_samples = REPORT_AGGREGATE.out.flat_samples.flatten().map{
                     sample ->
-                        def inx_string_suffix = params.report_aggregate.inx_string_insertion
                         def name_trim = sample.getName()
                         def trimmed_name = name_trim.substring(0, name_trim.length() - params.report_aggregate.sample_flat_suffix.length())
+                        def external_id_name = trimmed_name.getParent().getBaseName()
                         def output_map = [
                             "id": trimmed_name,
                             "sample": trimmed_name,
-                            "external_id": trimmed_name]
-
-                        def inx_sample_p = trimmed_name.indexOf(inx_string_suffix)
-                        if(inx_sample_p > 0){ // -1 was not being evaluated as true
-                            def inx_id = trimmed_name.substring(inx_sample_p + inx_string_suffix.length(), trimmed_name.length())
-                            trimmed_name = trimmed_name.substring(0, inx_sample_p)
-                            output_map.id = trimmed_name
-                            output_map.sample = trimmed_name
-                            output_map.external_id = inx_id
-                        }
+                            "external_id": external_id_name]
 
                         tuple(output_map, sample)
                     }
diff --git a/modules/local/report_aggregate.nf b/modules/local/report_aggregate.nf
index 3a2cf787..b4a6e180 100644
--- a/modules/local/report_aggregate.nf
+++ b/modules/local/report_aggregate.nf
@@ -14,13 +14,13 @@ process REPORT_AGGREGATE{
     path("final_report.tsv"), emit: final_report
     path("final_report_transposed.tsv"), emit: final_report_transposed
     path("final_report_flattened.json"), emit: flattened_files
-    path("*${sample_flat_suffix}"), emit: flat_samples
+    path("*/*${sample_flat_suffix}"), emit: flat_samples
     path "versions.yml", emit: versions
 
     script:
     sample_flat_suffix = params.report_aggregate.sample_flat_suffix
     """
-    report_summaries.py -f ${summary_report} -o final_report.tsv -s ${sample_flat_suffix} -x ${params.report_aggregate.inx_string_insertion}
+    report_summaries.py -f ${summary_report} -o final_report.tsv -s ${sample_flat_suffix}
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         python: \$(python --version | sed 's/Python //g')
diff --git a/nextflow.config b/nextflow.config
index c4a13e53..026a2444 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -716,7 +716,6 @@ params {
 
     report_aggregate {
         sample_flat_suffix = ".flat_sample.json"
-        inx_string_insertion = ".iridanext_output."
     }
 
 

From a48fb95052cad75fa8857db8326688a74c289263 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Fri, 1 Nov 2024 13:26:31 -0500
Subject: [PATCH 13/32] updated output of flattened sample reports

---
 bin/report_summaries.py | 8 +++++---
 main.nf                 | 2 +-
 tests/main.nf.test      | 2 +-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/bin/report_summaries.py b/bin/report_summaries.py
index 7686082c..73b17617 100755
--- a/bin/report_summaries.py
+++ b/bin/report_summaries.py
@@ -271,9 +271,11 @@ def output_indv_json(self, flattened_data):
                 sample_dir = v[self.__inx_irida_key]
                 #! this field affects the identification of the irida next id being passed out of the pipeline
                 out_key = sample_dir # this field must be overwritten for iridanext to identify the correct metdata field
-            out_path = os.path.join(self.output_dir, sample_dir, k + self.flat_sample_string)
-            if not os.is_dir(out_path): # Check for directory existence, as it will still exist on pipeline resumes
-                os.mkdir(out_path)
+
+            out_dir = os.path.join(self.output_dir, sample_dir)
+            out_path = os.path.join(out_dir, k + self.flat_sample_string)
+            if not os.path.isdir(out_dir): # Check for directory existence, as it will still exist on pipeline resumes
+                os.mkdir(out_dir)
 
             with open(out_path, "w") as output:
                 json_data = json.dumps({k: v}, indent=2)
diff --git a/main.nf b/main.nf
index cd761b7e..ad2e991c 100644
--- a/main.nf
+++ b/main.nf
@@ -112,7 +112,7 @@ workflow MIKROKONDO {
                     sample ->
                         def name_trim = sample.getName()
                         def trimmed_name = name_trim.substring(0, name_trim.length() - params.report_aggregate.sample_flat_suffix.length())
-                        def external_id_name = trimmed_name.getParent().getBaseName()
+                        def external_id_name = sample.getParent().getBaseName() //Calling getBaseName after getParent does not work as the output items is a string not Path
                         def output_map = [
                             "id": trimmed_name,
                             "sample": trimmed_name,
diff --git a/tests/main.nf.test b/tests/main.nf.test
index 784fa7be..5d741ee6 100644
--- a/tests/main.nf.test
+++ b/tests/main.nf.test
@@ -216,7 +216,7 @@ nextflow_pipeline {
             assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1
             assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1
             assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1
-            assert iridanext_samples.INX.findAll { it.path == "FinalReports/FlattenedReports/short.iridanext_output.INX.flat_sample.json.gz" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "FinalReports/FlattenedReports/short.flat_sample.json.gz" }.size() == 1
 
             // output metadata
             assert iridanext_metadata.INX."QC Status" == "PASSED"

From 39c8505824de4d5dbd577b7674858dc121103ce3 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Fri, 1 Nov 2024 13:27:41 -0500
Subject: [PATCH 14/32] fixed erroneous comment

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index ad2e991c..17f643c7 100644
--- a/main.nf
+++ b/main.nf
@@ -112,7 +112,7 @@ workflow MIKROKONDO {
                     sample ->
                         def name_trim = sample.getName()
                         def trimmed_name = name_trim.substring(0, name_trim.length() - params.report_aggregate.sample_flat_suffix.length())
-                        def external_id_name = sample.getParent().getBaseName() //Calling getBaseName after getParent does not work as the output items is a string not Path
+                        def external_id_name = sample.getParent().getBaseName()
                         def output_map = [
                             "id": trimmed_name,
                             "sample": trimmed_name,

From 14653fbae6871f0cfd0f226da4d9b1feeb24ab84 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Tue, 5 Nov 2024 16:10:13 -0600
Subject: [PATCH 15/32] updated sample field orders

---
 assets/schema_input.json | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/assets/schema_input.json b/assets/schema_input.json
index 68802293..068bf113 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -7,18 +7,18 @@
     "items": {
         "type": "object",
         "properties": {
-            "sample_name": {
+            "sample": {
                 "type": "string",
-                "pattern": "^[^\\s\\.]+$",
-                "meta": ["external_id"],
+                "pattern": "^[^\\.]\\S+$",
+                "meta": ["id"],
                 "unique": true,
-                "errorMessage": "This field cannot contain .iridanext_output."
+                "errorMessage": "Sample name to be used in report generation."
             },
-            "sample": {
+            "sample_name": {
                 "type": "string",
                 "pattern": "^[^\\.]\\S+$",
                 "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. Input cannot start with period.",
-                "meta": ["id"]
+                "meta": ["external_id"]
             },
             "fastq_1": {
                 "type": "string",

From 9c0bad45a544a8e1f8d84c48418b6d2830b9eacc Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Tue, 5 Nov 2024 16:29:12 -0600
Subject: [PATCH 16/32] updated logic for renaming sample id

---
 subworkflows/local/input_check.nf | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index b5d2ebc3..c8f9a39b 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -19,7 +19,11 @@ workflow INPUT_CHECK {
         parameters_schema: 'nextflow_schema.json',
         skip_duplicate_check: true).map {
             // Create grouping value
-            meta -> tuple(meta.id[0], meta[0])
+            meta -> println meta
+                if (meta[0].external_id != null) {
+                    meta[0].id = meta[0].external_id
+                }
+                tuple(meta[0].id, meta[0])
         }
 
     if(params.opt_platforms.ont == params.platform && params.nanopore_chemistry == null){

From c8827fe61116be560d84b1c6301bcae11cb5a41c Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Wed, 6 Nov 2024 13:38:03 -0600
Subject: [PATCH 17/32] updated sample parsing

---
 assets/schema_input.json          |  7 +++----
 nextflow.config                   |  3 ++-
 subworkflows/local/input_check.nf | 29 +++++++++++++++++++++++++++--
 3 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/assets/schema_input.json b/assets/schema_input.json
index 068bf113..fe23e4c3 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -9,15 +9,14 @@
         "properties": {
             "sample": {
                 "type": "string",
-                "pattern": "^[^\\.]\\S+$",
+                "pattern": "^\\S+$",
                 "meta": ["id"],
                 "unique": true,
-                "errorMessage": "Sample name to be used in report generation."
+                "errorMessage": "Sample name to be used in report generation. Invalid characters are replaces with underscores."
             },
             "sample_name": {
                 "type": "string",
-                "pattern": "^[^\\.]\\S+$",
-                "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. Input cannot start with period.",
+                "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. Invalid characters will be replaced with underscores.",
                 "meta": ["external_id"]
             },
             "fastq_1": {
diff --git a/nextflow.config b/nextflow.config
index 026a2444..86c15d4a 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -44,10 +44,11 @@ params {
     show_hidden_params = false
     validationS3PathCheck = true
     validationShowHiddenParams = false
-    validationSchemaIgnoreParams = '__in_iridanext,rasusa,locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler'
+    validationSchemaIgnoreParams = 'rasusa,locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler'
     validationFailUnrecognisedParams = false // for the qcreport fields
 
     // SKIP options
+    skip_read_merging = true
     skip_report = false
     skip_raw_read_metrics = false
     skip_version_gathering = false
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index c8f9a39b..c9011aaf 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -14,15 +14,40 @@ workflow INPUT_CHECK {
 
     versions = Channel.empty()
     def sample_sheet = params.input
+
+    // Thank you snvphylnfc for the ideas :)
+    // https://github.com/phac-nml/snvphylnfc/blob/f1e5fae76af276acf0a8c98174978cb21ca5d7e0/workflows/snvphylnfc.nf#L98-L109
+    def processedIDs = [] as Set
+
     reads_in = Channel.fromSamplesheet(
         "input", // apparentely input maps to params.input...
         parameters_schema: 'nextflow_schema.json',
         skip_duplicate_check: true).map {
             // Create grouping value
-            meta -> println meta
+            meta ->
+                // Remove any unallowed charactars in the meta.id field
+                meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_.\-]/, '_')
+
                 if (meta[0].external_id != null) {
-                    meta[0].id = meta[0].external_id
+                    // remove any charactars in the external_id that should not be used
+                    meta[0].id = meta[0].external_id.replaceAll(/[^A-Za-z0-9_.\-]/, '_')
                 }
+
+                if(processedIDs.contains(meta.id) && params.skip_read_merging){
+                    // If the id is already contained and read merging is not to be
+                    // performed, then we make the id's unique to proceed with processing
+                    // read merging is set to false by default, so that when it is run
+                    // in IRIDANext reads are only merged in irida next
+                    while (processedIDs.contains(meta.id)) {
+                        meta.id = "${meta.id}_${meta.external_id}"
+                    }
+                }else{
+                    // Set the external id to the input ID.
+                    meta[0].external_id = meta[0].id
+                }
+
+
+                processedIDs << meta.id
                 tuple(meta[0].id, meta[0])
         }
 

From db5f420e82e289cd596817f48772b2e3323438a4 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Wed, 6 Nov 2024 13:56:03 -0600
Subject: [PATCH 18/32] updated docs, changelog and nextflow_schema.json

---
 CHANGELOG.md         | 2 ++
 docs/usage/usage.md  | 3 +++
 nextflow_schema.json | 6 ++++++
 3 files changed, 11 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 58aa5c80..98c6172b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Added a new field to the `schema_input.json` file to allow for sample ID's from external systems such as IRIDA Next: [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)
 
+- Incorporated a `--skip_read_merging` parameter to prevent read merging [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)
+
 ### `Changed`
 
 - Added a `sample_name` field, `sample` still exists but is used for different purposes [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)
diff --git a/docs/usage/usage.md b/docs/usage/usage.md
index 34aeec1c..eff83863 100644
--- a/docs/usage/usage.md
+++ b/docs/usage/usage.md
@@ -23,6 +23,7 @@ Mikrokondo requires a sample sheet to be run. This FOFN (file of file names) con
 - long_reads
 - assembly
 
+> **Note:** Illegal characters (e.g. characters that match the expression [^A-Za-z0-9_.\-] ) in the sample name will be replaced with underscores.
 
 Example layouts for different sample-sheets include:
 
@@ -96,6 +97,8 @@ _Example merging paired-end data_
 Numerous steps within mikrokondo can be turned off without compromising the stability of the pipeline. This skip options can reduce run-time of the pipeline or allow for completion of the pipeline despite errors.
 ** All of the above options can be turned on by entering `--{skip_option} true` in the command line arguments to the pipeline (where optional parameters can be added)**
 
+
+- `--skip_read_merging`: Do not merge reads, if duplicate sample names are present the names will be made unique.
 - `--skip_abricate`: turn off abricate AMR detection
 - `--skip_bakta`: turn off bakta annotation pipeline (generally a slow step, requiring a database to be specified).
 - `--skip_checkm`: used as part of the contamination detection within mikrokondo, its run time and resource usage can be quite lengthy.
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 3de0abcb..70bd6ab5 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -380,6 +380,12 @@
                     "type": "boolean",
                     "description": "Do not enter the subtyping workflow, e.g. ECTyper, SISTR etc will not be ran."
                 },
+                "skip_read_merging": {
+                    "type": "boolean",
+                    "default": true,
+                    "description": "Do not merge reads",
+                    "hidden": true
+                },
                 "skip_bakta": {
                     "type": "boolean",
                     "default": true,

From 52af4a9ed5839a39402993b91577373d2432898c Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Wed, 6 Nov 2024 15:06:39 -0600
Subject: [PATCH 19/32] updated test cases

---
 tests/main.nf.test | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/main.nf.test b/tests/main.nf.test
index 5d741ee6..ef559da2 100644
--- a/tests/main.nf.test
+++ b/tests/main.nf.test
@@ -209,17 +209,17 @@ nextflow_pipeline {
             // output files
             assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1
-            assert iridanext_samples.INX.findAll { it.path == "Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz" }.size() == 1
-            assert iridanext_samples.INX.findAll { it.path == "Assembly/Quality/QUAST/short/short.transposed_short.quast.quality.tsv" }.size() == 1
-            assert iridanext_samples.INX.findAll { it.path == "Assembly/Quality/SeqKitStats/short.seqkit.stats.summary.tsv" }.size() == 1
-            assert iridanext_samples.INX.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/short.mash.screen.taxa.screen.screen" }.size() == 1
-            assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1
-            assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1
-            assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1
-            assert iridanext_samples.INX.findAll { it.path == "FinalReports/FlattenedReports/short.flat_sample.json.gz" }.size() == 1
+            assert iridanext_samples.short.findAll { it.path == "Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz" }.size() == 1
+            assert iridanext_samples.short.findAll { it.path == "Assembly/Quality/QUAST/short/short.transposed_short.quast.quality.tsv" }.size() == 1
+            assert iridanext_samples.short.findAll { it.path == "Assembly/Quality/SeqKitStats/short.seqkit.stats.summary.tsv" }.size() == 1
+            assert iridanext_samples.short.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/short.mash.screen.taxa.screen.screen" }.size() == 1
+            assert iridanext_samples.short.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1
+            assert iridanext_samples.short.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1
+            assert iridanext_samples.short.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1
+            assert iridanext_samples.short.findAll { it.path == "FinalReports/FlattenedReports/short.flat_sample.json.gz" }.size() == 1
 
             // output metadata
-            assert iridanext_metadata.INX."QC Status" == "PASSED"
+            assert iridanext_metadata.short."QC Status" == "PASSED"
 
         }
 

From eb759696e4d2ac85fd317047c18b5500a7f569a4 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Wed, 6 Nov 2024 16:29:03 -0600
Subject: [PATCH 20/32] updating inputcheck tests

---
 assets/schema_input.json                           |  1 -
 modules/local/combine_data.nf                      |  8 ++++----
 subworkflows/local/input_check.nf                  | 12 +++---------
 tests/data/samplesheets/samplesheet-merge-test.csv |  4 ++++
 4 files changed, 11 insertions(+), 14 deletions(-)
 create mode 100644 tests/data/samplesheets/samplesheet-merge-test.csv

diff --git a/assets/schema_input.json b/assets/schema_input.json
index fe23e4c3..fd8b8c2a 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -11,7 +11,6 @@
                 "type": "string",
                 "pattern": "^\\S+$",
                 "meta": ["id"],
-                "unique": true,
                 "errorMessage": "Sample name to be used in report generation. Invalid characters are replaces with underscores."
             },
             "sample_name": {
diff --git a/modules/local/combine_data.nf b/modules/local/combine_data.nf
index cf76dded..c03d2083 100644
--- a/modules/local/combine_data.nf
+++ b/modules/local/combine_data.nf
@@ -20,16 +20,16 @@ process COMBINE_DATA{
     def fields_merge = meta.fields_merge
 
     if(fastq_1){
-        cmd_ << "cat ${meta.fastq_1.join(' ')} > out/${prefix}_R1.merged.fastq.gz;"
+        cmd_ << "cat ${fastq_1.join(' ')} > out/${prefix}_R1.merged.fastq.gz;"
     }
     if(fastq_2){
-        cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}_R2.merged.fastq.gz;"
+        cmd_ << "cat ${fastq_2.join(' ')} > out/${prefix}_R2.merged.fastq.gz;"
     }
     if(long_reads){
-        cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}.merged.fastq.gz;"
+        cmd_ << "cat ${long_reads.join(' ')} > out/${prefix}.merged.fastq.gz;"
     }
     if(assembly){
-        cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}.merged.fastq.gz;"
+        cmd_ << "cat ${assembly.join(' ')} > out/${prefix}.merged.fastq.gz;"
     }
     def cmd = cmd_.join("\n")
     // creating dummy outputs so that all outputs exist for any scenario
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index c9011aaf..23668c95 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -6,8 +6,6 @@ include { COMBINE_DATA } from '../../modules/local/combine_data.nf'
 include { fromSamplesheet } from 'plugin/nf-validation'
 
 
-
-
 workflow INPUT_CHECK {
 
     main:
@@ -96,7 +94,7 @@ workflow INPUT_CHECK {
     versions = versions // channel: [ versions.yml ]
 }
 
-def reset_combined_map(LinkedHashMap meta, sun.nio.fs.UnixPath f_reads, sun.nio.fs.UnixPath r_reads, sun.nio.fs.UnixPath long_reads, sun.nio.fs.UnixPath assembly){
+def reset_combined_map(LinkedHashMap meta, Path f_reads, Path r_reads, Path long_reads, Path assembly){
     /*Re-format the data to make it similar to make it match the input format again
 
     */
@@ -124,7 +122,7 @@ def reset_combined_map(LinkedHashMap meta, sun.nio.fs.UnixPath f_reads, sun.nio.
 
 def check_file_exists(def file_path){
     if(!file(file_path).exists()){
-        exit 1, "ERROR: Please check input samplesheet -> $file_path does not exist. If your file in you sample sheet does not exist make sure you do not have spaces in your path name."
+        exit 1, "ERROR: Please check input samplesheet -> $file_path does not exist. Check that you do not have spaces in your path."
     }
     return true
 }
@@ -134,10 +132,6 @@ def format_reads(ArrayList sheet_data){
     def error_occured = false
     meta.id = sheet_data[0] // id is first value
     meta.sample = sheet_data[0] // Sample will be id currently
-    meta.external_id = sheet_data[0] // This is duplicated to keep later scripting cleaner
-    if(sheet_data[1].external_id != null){
-        meta.external_id = sheet_data[1].external_id
-    }
 
     meta.hybrid = false
     meta.assembly = false
@@ -222,7 +216,7 @@ def group_reads(ArrayList read_data){
                 reads_combine[item] = []
             }
             if(group[item] && check_file_exists(group[item])){
-                reads_combine[item] << group[item]
+                reads_combine[item] << file(group[item])
             }
         }
     }
diff --git a/tests/data/samplesheets/samplesheet-merge-test.csv b/tests/data/samplesheets/samplesheet-merge-test.csv
new file mode 100644
index 00000000..1d275345
--- /dev/null
+++ b/tests/data/samplesheets/samplesheet-merge-test.csv
@@ -0,0 +1,4 @@
+sample,fastq_1,fastq_2,long_reads,assembly
+CSE,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,,
+CSE,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq.gz,,
+un-merged,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads2.fq.gz,,

From 45ce5a2ee59545bf8f15da5c0356679f1b70ae2f Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Wed, 6 Nov 2024 16:29:20 -0600
Subject: [PATCH 21/32] added missing input_check subworkflow test file

---
 .../local/input_check/input_check.nf.test     | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 tests/subworkflows/local/input_check/input_check.nf.test

diff --git a/tests/subworkflows/local/input_check/input_check.nf.test b/tests/subworkflows/local/input_check/input_check.nf.test
new file mode 100644
index 00000000..beccd3f3
--- /dev/null
+++ b/tests/subworkflows/local/input_check/input_check.nf.test
@@ -0,0 +1,76 @@
+nextflow_workflow {
+    name "Test workflow INPUT_CHECK"
+    script "subworkflows/local/input_check.nf"
+    workflow "INPUT_CHECK"
+    tag "subworkflow"
+    tag "input_check"
+
+    test("Test input check") {
+        tag "pass_input_screen"
+
+        when {
+
+            params {
+                input = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/samplesheets/samplesheet-campy-staph.csv"
+                outdir = "results"
+                min_reads = 1
+                mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh"
+                mh_min_kmer = 1
+
+                dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi"
+
+                kraken2_db = "$baseDir/tests/data/kraken2/test"
+
+
+                max_memory = "2.GB"
+                max_cpus = 1
+            }
+
+        }
+
+        then {
+            assert workflow.success
+            assert workflow.out.reads == [[['id':'CSE', 'sample':'CSE', 'hybrid':false, 'assembly':false, 'downsampled':false, 'single_end':false, 'merge':false], ['/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz', '/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz']]]
+        }
+    }
+
+    test("Test reads are merged") {
+    tag "pass_merge_reads"
+
+        when {
+
+            params {
+                input = "$baseDir/tests/data/samplesheets/samplesheet-merge-test.csv"
+                outdir = "results"
+                min_reads = 1
+                mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh"
+                mh_min_kmer = 1
+                skip_read_merging = false
+
+                dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi"
+
+                kraken2_db = "$baseDir/tests/data/kraken2/test"
+
+
+                max_memory = "2.GB"
+                max_cpus = 1
+            }
+
+        }
+
+        then {
+            assert workflow.success
+            assert workflow.out.reads[0][0].id == 'CSE'
+            assert workflow.out.reads[0][0].merge == true
+            assert workflow.out.reads[0][1][0].endsWith("CSE_R1.merged.fastq.gz")
+            assert workflow.out.reads[0][1][1].endsWith("CSE_R2.merged.fastq.gz")
+
+            assert workflow.out.reads[1][0].id == 'un-merged'
+            assert workflow.out.reads[1][0].merge == false
+            assert workflow.out.reads[1][1][0].endsWith("metagenomic_reads1.fq.gz")
+            assert workflow.out.reads[1][1][1].endsWith("metagenomic_reads2.fq.gz")
+        }
+
+    }
+
+}

From eec62b329391ca5543b9a67e820c9a4afc806635 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Wed, 6 Nov 2024 16:46:59 -0600
Subject: [PATCH 22/32] added external id samplesheet and input_check test

---
 .../samplesheets/samplesheet-set-ext-id.csv   |  5 +++
 .../local/input_check/input_check.nf.test     | 32 +++++++++++++++++++
 2 files changed, 37 insertions(+)
 create mode 100644 tests/data/samplesheets/samplesheet-set-ext-id.csv

diff --git a/tests/data/samplesheets/samplesheet-set-ext-id.csv b/tests/data/samplesheets/samplesheet-set-ext-id.csv
new file mode 100644
index 00000000..e0d02480
--- /dev/null
+++ b/tests/data/samplesheets/samplesheet-set-ext-id.csv
@@ -0,0 +1,5 @@
+sample,sample_name,fastq_1,fastq_2,long_reads,assembly
+CSE,better.faster.stronger.name,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,,
+CSE2,an even stronger name!,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq.gz,,
+unique2,this is getting ridiculous,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads2.fq.gz,,
+unique3,this is getting ridiculous,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads.fastq,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq,,
diff --git a/tests/subworkflows/local/input_check/input_check.nf.test b/tests/subworkflows/local/input_check/input_check.nf.test
index beccd3f3..6982dff5 100644
--- a/tests/subworkflows/local/input_check/input_check.nf.test
+++ b/tests/subworkflows/local/input_check/input_check.nf.test
@@ -73,4 +73,36 @@ nextflow_workflow {
 
     }
 
+    test("Test external id is set") {
+    tag "pass_set_external"
+
+        when {
+
+            params {
+                input = "$baseDir/tests/data/samplesheets/samplesheet-set-ext-id.csv"
+                outdir = "results"
+                min_reads = 1
+                mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh"
+                mh_min_kmer = 1
+                skip_read_merging = false
+
+                dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi"
+
+                kraken2_db = "$baseDir/tests/data/kraken2/test"
+
+
+                max_memory = "2.GB"
+                max_cpus = 1
+            }
+
+        }
+
+        then {
+            assert workflow.success
+            println workflow.out.reads
+            //assert workflow.out.reads[0][0].id == "better_faster_stronger_name"
+        }
+
+    }
+
 }

From 733db44548d87d15a411c21d4c8b922031dfe047 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Thu, 7 Nov 2024 12:47:50 -0600
Subject: [PATCH 23/32] fixed failing tests

---
 subworkflows/local/input_check.nf                        | 4 ++++
 tests/subworkflows/local/input_check/input_check.nf.test | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 23668c95..462f7c2d 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -29,8 +29,11 @@ workflow INPUT_CHECK {
                 if (meta[0].external_id != null) {
                     // remove any charactars in the external_id that should not be used
                     meta[0].id = meta[0].external_id.replaceAll(/[^A-Za-z0-9_.\-]/, '_')
+                }else{
+                    meta[0].external_id = meta[0].id
                 }
 
+
                 if(processedIDs.contains(meta.id) && params.skip_read_merging){
                     // If the id is already contained and read merging is not to be
                     // performed, then we make the id's unique to proceed with processing
@@ -132,6 +135,7 @@ def format_reads(ArrayList sheet_data){
     def error_occured = false
     meta.id = sheet_data[0] // id is first value
     meta.sample = sheet_data[0] // Sample will be id currently
+    meta.external_id = sheet_data[1].external_id
 
     meta.hybrid = false
     meta.assembly = false
diff --git a/tests/subworkflows/local/input_check/input_check.nf.test b/tests/subworkflows/local/input_check/input_check.nf.test
index 6982dff5..8fff14fa 100644
--- a/tests/subworkflows/local/input_check/input_check.nf.test
+++ b/tests/subworkflows/local/input_check/input_check.nf.test
@@ -98,6 +98,8 @@ nextflow_workflow {
         }
 
         then {
+
+            // TODO beef up assertions
             assert workflow.success
             println workflow.out.reads
             //assert workflow.out.reads[0][0].id == "better_faster_stronger_name"
@@ -105,4 +107,6 @@ nextflow_workflow {
 
     }
 
+    // TODO add test case for making read ids unique
+
 }

From 71260a99ab1334d0ffcafb008e673dce804c973a Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Thu, 7 Nov 2024 14:19:00 -0600
Subject: [PATCH 24/32] updated tests

---
 subworkflows/local/input_check.nf             | 13 +++---
 .../samplesheet-make-names-unique.csv         |  5 ++
 .../local/input_check/input_check.nf.test     | 46 +++++++++++++++++--
 3 files changed, 54 insertions(+), 10 deletions(-)
 create mode 100644 tests/data/samplesheets/samplesheet-make-names-unique.csv

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 462f7c2d..b8c327d3 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -23,24 +23,25 @@ workflow INPUT_CHECK {
         skip_duplicate_check: true).map {
             // Create grouping value
             meta ->
+
                 // Remove any unallowed charactars in the meta.id field
-                meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_.\-]/, '_')
+                meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_\-]/, '_')
 
                 if (meta[0].external_id != null) {
                     // remove any charactars in the external_id that should not be used
-                    meta[0].id = meta[0].external_id.replaceAll(/[^A-Za-z0-9_.\-]/, '_')
+                    meta[0].id = meta[0].external_id.replaceAll(/[^A-Za-z0-9_\-]/, '_')
                 }else{
                     meta[0].external_id = meta[0].id
                 }
 
 
-                if(processedIDs.contains(meta.id) && params.skip_read_merging){
+                if(processedIDs.contains(meta[0].id) && params.skip_read_merging){
                     // If the id is already contained and read merging is not to be
                     // performed, then we make the id's unique to proceed with processing
                     // read merging is set to false by default, so that when it is run
                     // in IRIDANext reads are only merged in irida next
-                    while (processedIDs.contains(meta.id)) {
-                        meta.id = "${meta.id}_${meta.external_id}"
+                    while (processedIDs.contains(meta[0].id)) {
+                        meta[0].id = "${meta[0].id}_${meta[0].external_id}"
                     }
                 }else{
                     // Set the external id to the input ID.
@@ -48,7 +49,7 @@ workflow INPUT_CHECK {
                 }
 
 
-                processedIDs << meta.id
+                processedIDs << meta[0].id
                 tuple(meta[0].id, meta[0])
         }
 
diff --git a/tests/data/samplesheets/samplesheet-make-names-unique.csv b/tests/data/samplesheets/samplesheet-make-names-unique.csv
new file mode 100644
index 00000000..09d8672c
--- /dev/null
+++ b/tests/data/samplesheets/samplesheet-make-names-unique.csv
@@ -0,0 +1,5 @@
+sample,fastq_1,fastq_2,long_reads,assembly
+ha,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz,,
+ha,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq.gz,,
+ha,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads1.fq.gz,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/metagenomic_reads2.fq.gz,,
+ha,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads.fastq,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/reads/1_R2.fq,,
diff --git a/tests/subworkflows/local/input_check/input_check.nf.test b/tests/subworkflows/local/input_check/input_check.nf.test
index 8fff14fa..dcf977dd 100644
--- a/tests/subworkflows/local/input_check/input_check.nf.test
+++ b/tests/subworkflows/local/input_check/input_check.nf.test
@@ -99,14 +99,52 @@ nextflow_workflow {
 
         then {
 
-            // TODO beef up assertions
             assert workflow.success
-            println workflow.out.reads
-            //assert workflow.out.reads[0][0].id == "better_faster_stronger_name"
+            assert workflow.out.reads[0][0].id == 'an_even_stronger_name_'
+            assert workflow.out.reads[0][0].merge == false
+            assert workflow.out.reads[1][0].id == 'better_faster_stronger_name'
+            assert workflow.out.reads[1][0].merge == false
+            assert workflow.out.reads[2][0].id == 'this_is_getting_ridiculous'
+            assert workflow.out.reads[2][0].merge == true
+            assert workflow.out.reads[2][1][0].endsWith("this_is_getting_ridiculous_R1.merged.fastq.gz")
+            assert workflow.out.reads[2][1][1].endsWith("this_is_getting_ridiculous_R2.merged.fastq.gz")
+        }
+
+    }
+
+    test("Test make ids unique") {
+    tag "pass_make_ids_unique"
+
+        when {
+
+            params {
+                input = "$baseDir/tests/data/samplesheets/samplesheet-make-names-unique.csv"
+                outdir = "results"
+                min_reads = 1
+                mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh"
+                mh_min_kmer = 1
+                skip_read_merging = true
+                dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi"
+
+                kraken2_db = "$baseDir/tests/data/kraken2/test"
+
+
+                max_memory = "2.GB"
+                max_cpus = 1
+            }
+
+        }
+
+        then {
+
+            assert workflow.success
+            assert workflow.out.reads[0][0].id = ha
+            assert workflow.out.reads[1][0].id = ha_ha
+            assert workflow.out.reads[2][0].id = ha_ha_ha
+            assert workflow.out.reads[3][0].id = ha_ha_ha_ha
         }
 
     }
 
-    // TODO add test case for making read ids unique
 
 }

From 3c4e1c4a61248797fd7e71c3f206bb0e279e546d Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Thu, 7 Nov 2024 14:26:07 -0600
Subject: [PATCH 25/32] fixed my own mistakes

---
 tests/subworkflows/local/input_check/input_check.nf.test | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/subworkflows/local/input_check/input_check.nf.test b/tests/subworkflows/local/input_check/input_check.nf.test
index dcf977dd..ec28cc7e 100644
--- a/tests/subworkflows/local/input_check/input_check.nf.test
+++ b/tests/subworkflows/local/input_check/input_check.nf.test
@@ -138,10 +138,10 @@ nextflow_workflow {
         then {
 
             assert workflow.success
-            assert workflow.out.reads[0][0].id = ha
-            assert workflow.out.reads[1][0].id = ha_ha
-            assert workflow.out.reads[2][0].id = ha_ha_ha
-            assert workflow.out.reads[3][0].id = ha_ha_ha_ha
+            assert workflow.out.reads[0][0].id == 'ha'
+            assert workflow.out.reads[1][0].id == 'ha_ha'
+            assert workflow.out.reads[2][0].id == 'ha_ha_ha'
+            assert workflow.out.reads[3][0].id == 'ha_ha_ha_ha'
         }
 
     }

From 738943b31aa5fa286dcff056cb2937d252b684cc Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Thu, 7 Nov 2024 15:05:01 -0600
Subject: [PATCH 26/32] fixed failing test

---
 .../local/input_check/input_check.nf.test            | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/subworkflows/local/input_check/input_check.nf.test b/tests/subworkflows/local/input_check/input_check.nf.test
index ec28cc7e..6eb54be9 100644
--- a/tests/subworkflows/local/input_check/input_check.nf.test
+++ b/tests/subworkflows/local/input_check/input_check.nf.test
@@ -30,7 +30,17 @@ nextflow_workflow {
 
         then {
             assert workflow.success
-            assert workflow.out.reads == [[['id':'CSE', 'sample':'CSE', 'hybrid':false, 'assembly':false, 'downsampled':false, 'single_end':false, 'merge':false], ['/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph1.fq.gz', '/phac-nml/mikrokondo/raw/dev/tests/data/reads/campy-staph2.fq.gz']]]
+            assert workflow.out.reads[0][0].id == 'CSE'
+            assert workflow.out.reads[0][0].external_id == 'CSE'
+            assert workflow.out.reads[0][0].sample == 'CSE'
+            assert workflow.out.reads[0][0].hybrid == false
+            assert workflow.out.reads[0][0].assembly == false
+            assert workflow.out.reads[0][0].downsampled == false
+            assert workflow.out.reads[0][0].single_end == false
+            assert workflow.out.reads[0][0].merge == false
+            assert workflow.out.reads[0][1][0].endsWith('campy-staph1.fq.gz')
+            assert workflow.out.reads[0][1][1].endsWith('campy-staph2.fq.gz')
+
         }
     }
 

From b1e60dd7e05670bcd6f542fd55c2fe2fe4220b59 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Fri, 8 Nov 2024 12:05:26 -0600
Subject: [PATCH 27/32] swapped external_id and id

---
 assets/schema_input.json          |  4 ++--
 subworkflows/local/input_check.nf | 24 +++++++++++-------------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/assets/schema_input.json b/assets/schema_input.json
index fd8b8c2a..39a3c830 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -10,13 +10,13 @@
             "sample": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "meta": ["id"],
+                "meta": ["external_id"],
                 "errorMessage": "Sample name to be used in report generation. Invalid characters are replaces with underscores."
             },
             "sample_name": {
                 "type": "string",
                 "errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. Invalid characters will be replaced with underscores.",
-                "meta": ["external_id"]
+                "meta": ["id"]
             },
             "fastq_1": {
                 "type": "string",
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index b8c327d3..4385fb05 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -24,14 +24,16 @@ workflow INPUT_CHECK {
             // Create grouping value
             meta ->
 
-                // Remove any unallowed charactars in the meta.id field
-                meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_\-]/, '_')
-
-                if (meta[0].external_id != null) {
-                    // remove any charactars in the external_id that should not be used
-                    meta[0].id = meta[0].external_id.replaceAll(/[^A-Za-z0-9_\-]/, '_')
-                }else{
-                    meta[0].external_id = meta[0].id
+                // Verify file names do not start with periods as the files can end up being treated as
+                // hidden files causing odd issues later on in the pipeline
+
+                if(meta[0].id == null){
+                    // Remove any unallowed charactars in the meta.id field
+                    meta[0].id = meta[0].external_id.replaceAll(/^\./, '_')
+                    meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_.\-]/, '_')
+                }else {
+                    meta[0].id = meta[0].id.replaceAll(/^\./, '_')
+                    meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_.\-]/, '_')
                 }
 
 
@@ -43,12 +45,8 @@ workflow INPUT_CHECK {
                     while (processedIDs.contains(meta[0].id)) {
                         meta[0].id = "${meta[0].id}_${meta[0].external_id}"
                     }
-                }else{
-                    // Set the external id to the input ID.
-                    meta[0].external_id = meta[0].id
                 }
 
-
                 processedIDs << meta[0].id
                 tuple(meta[0].id, meta[0])
         }
@@ -135,7 +133,7 @@ def format_reads(ArrayList sheet_data){
     def meta = [:]
     def error_occured = false
     meta.id = sheet_data[0] // id is first value
-    meta.sample = sheet_data[0] // Sample will be id currently
+    meta.sample = sheet_data[1].external_id
     meta.external_id = sheet_data[1].external_id
 
     meta.hybrid = false

From 70d02912b9a958334dc4f39105cd22a3b5c52637 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Fri, 8 Nov 2024 16:45:31 -0600
Subject: [PATCH 28/32] updating information before the weekend

---
 CHANGELOG.md                                             | 2 +-
 modules/local/report.nf                                  | 8 +++++++-
 subworkflows/local/input_check.nf                        | 9 +++++----
 ...plesheet-test-from-assemblies-vibrio-stupid-names.csv | 4 ++--
 tests/subworkflows/local/input_check/input_check.nf.test | 2 +-
 5 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 98c6172b..62f2a563 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Added RASUSA for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)
 
-- Added a new field to the `schema_input.json` file to allow for sample ID's from external systems such as IRIDA Next: [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)
+- Added a new `sample_name` field to the `schema_input.json` file: [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)
 
 - Incorporated a `--skip_read_merging` parameter to prevent read merging [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)
 
diff --git a/modules/local/report.nf b/modules/local/report.nf
index 8a469955..f361a0f2 100644
--- a/modules/local/report.nf
+++ b/modules/local/report.nf
@@ -42,18 +42,22 @@ process REPORT{
         def report_value = test_in[i+2]
 
         if(!sample_data.containsKey(meta_data.sample)){
+            // Todo issue grabbing correct tag is here
             sample_data[meta_data.sample] = [:]
             sample_data[meta_data.sample]["meta"] = [:]
         }
 
         update_map_values(sample_data, meta_data, "metagenomic")
-        update_map_values(sample_data, meta_data, "external_id")
+        //update_map_values(sample_data, meta_data, "sample")
+        //update_map_values(sample_data, meta_data, "external_id")
+        //update_map_values(sample_data, meta_data, "id")
         update_map_values(sample_data, meta_data, "assembly")
         update_map_values(sample_data, meta_data, "hybrid")
         update_map_values(sample_data, meta_data, "single_end")
         update_map_values(sample_data, meta_data, "merge")
         update_map_values(sample_data, meta_data, "downsampled")
 
+
         if(!sample_data[meta_data.sample].containsKey(meta_data.id)){
             sample_data[meta_data.sample][meta_data.id] = [:]
         }
@@ -678,6 +682,8 @@ def generate_qc_data(data, search_phrases, qc_species_tag){
     def species_tag_location = 0
     for(k in data){
         if(!k.value.meta.metagenomic){
+            println k.value
+            println k.key
             def species = get_species(k.value[k.key][top_hit_tag], search_phrases, shortest_token)
             // update coverage first so its values can be used in generating qc messages
             generate_coverage_data(data[k.key], params.coverage_calc_fields.bp_field, species)
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 4385fb05..edbcb5f0 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -24,16 +24,17 @@ workflow INPUT_CHECK {
             // Create grouping value
             meta ->
 
+
                 // Verify file names do not start with periods as the files can end up being treated as
                 // hidden files causing odd issues later on in the pipeline
 
                 if(meta[0].id == null){
                     // Remove any unallowed charactars in the meta.id field
                     meta[0].id = meta[0].external_id.replaceAll(/^\./, '_')
-                    meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_.\-]/, '_')
+                    meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_\.\-]/, '_')
                 }else {
                     meta[0].id = meta[0].id.replaceAll(/^\./, '_')
-                    meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_.\-]/, '_')
+                    meta[0].id = meta[0].id.replaceAll(/[^A-Za-z0-9_\.\-]/, '_')
                 }
 
 
@@ -46,7 +47,7 @@ workflow INPUT_CHECK {
                         meta[0].id = "${meta[0].id}_${meta[0].external_id}"
                     }
                 }
-
+                println "${meta[0].id} ${meta[0]}"
                 processedIDs << meta[0].id
                 tuple(meta[0].id, meta[0])
         }
@@ -134,7 +135,7 @@ def format_reads(ArrayList sheet_data){
     def error_occured = false
     meta.id = sheet_data[0] // id is first value
     meta.sample = sheet_data[1].external_id
-    meta.external_id = sheet_data[1].external_id
+    meta.external_id = sheet_data[0]
 
     meta.hybrid = false
     meta.assembly = false
diff --git a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv
index 0a1d49a8..b3227593 100644
--- a/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv
+++ b/tests/data/samplesheets/samplesheet-test-from-assemblies-vibrio-stupid-names.csv
@@ -1,2 +1,2 @@
-sample_name,sample,fastq_1,fastq_2,long_reads,assembly
-INX,iridanext_output.,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz
+sample,sample_name,fastq_1,fastq_2,long_reads,assembly
+INX,.iridanext_output.,,,,https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/fake_contigs/vibrio_cholerae/st_120.fa.gz
diff --git a/tests/subworkflows/local/input_check/input_check.nf.test b/tests/subworkflows/local/input_check/input_check.nf.test
index 6eb54be9..df362557 100644
--- a/tests/subworkflows/local/input_check/input_check.nf.test
+++ b/tests/subworkflows/local/input_check/input_check.nf.test
@@ -112,7 +112,7 @@ nextflow_workflow {
             assert workflow.success
             assert workflow.out.reads[0][0].id == 'an_even_stronger_name_'
             assert workflow.out.reads[0][0].merge == false
-            assert workflow.out.reads[1][0].id == 'better_faster_stronger_name'
+            assert workflow.out.reads[1][0].id == 'better.faster.stronger.name'
             assert workflow.out.reads[1][0].merge == false
             assert workflow.out.reads[2][0].id == 'this_is_getting_ridiculous'
             assert workflow.out.reads[2][0].merge == true

From 6ba57b0d613792c00db142e71d996de82de7e611 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Tue, 12 Nov 2024 10:13:12 -0600
Subject: [PATCH 29/32] fixed stupid name issue report keys not found

---
 bin/report_summaries.py           |  5 ++---
 modules/local/report.nf           | 14 ++++++--------
 subworkflows/local/input_check.nf |  3 +--
 3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/bin/report_summaries.py b/bin/report_summaries.py
index 73b17617..17ca74e3 100755
--- a/bin/report_summaries.py
+++ b/bin/report_summaries.py
@@ -264,11 +264,10 @@ def output_indv_json(self, flattened_data):
         """
         updated_items = dict()
         for k, v in flattened_data.items():
-            #out_path = os.path.join(self.output_dir, k + self.flat_sample_string)
             out_key = k
             sample_dir = k
-            if v.get(self.__inx_irida_key) != k:
-                sample_dir = v[self.__inx_irida_key]
+            if (dir_name := v.get(self.__inx_irida_key)) is not None and dir_name != k:
+                sample_dir = dir_name
                 #! this field affects the identification of the irida next id being passed out of the pipeline
                 out_key = sample_dir # this field must be overwritten for iridanext to identify the correct metdata field
 
diff --git a/modules/local/report.nf b/modules/local/report.nf
index f361a0f2..bd14fce7 100644
--- a/modules/local/report.nf
+++ b/modules/local/report.nf
@@ -41,6 +41,7 @@ process REPORT{
         def report_tag = test_in[i+1]
         def report_value = test_in[i+2]
 
+        println meta_data
         if(!sample_data.containsKey(meta_data.sample)){
             // Todo issue grabbing correct tag is here
             sample_data[meta_data.sample] = [:]
@@ -48,18 +49,16 @@ process REPORT{
         }
 
         update_map_values(sample_data, meta_data, "metagenomic")
-        //update_map_values(sample_data, meta_data, "sample")
-        //update_map_values(sample_data, meta_data, "external_id")
-        //update_map_values(sample_data, meta_data, "id")
+        update_map_values(sample_data, meta_data, "sample")
+        update_map_values(sample_data, meta_data, "external_id")
         update_map_values(sample_data, meta_data, "assembly")
         update_map_values(sample_data, meta_data, "hybrid")
         update_map_values(sample_data, meta_data, "single_end")
         update_map_values(sample_data, meta_data, "merge")
         update_map_values(sample_data, meta_data, "downsampled")
 
-
-        if(!sample_data[meta_data.sample].containsKey(meta_data.id)){
-            sample_data[meta_data.sample][meta_data.id] = [:]
+        if(!sample_data[meta_data.sample].containsKey(meta_data.external_id)){
+            sample_data[meta_data.sample][meta_data.external_id] = [:]
         }
 
         if(report_value instanceof Path){
@@ -67,14 +66,13 @@ process REPORT{
             if(!check_file_params(report_tag, extension)){
                 continue
             }
-            // TODO pass in report metadata
             def output_data = parse_data(report_value, extension, report_tag, headers_list)
             if(output_data){
                 report_value = output_data
             }
         }
 
-        sample_data[meta_data.sample][meta_data.id][report_tag.report_tag] = report_value
+        sample_data[meta_data.sample][meta_data.external_id][report_tag.report_tag] = report_value
     }
 
 
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index edbcb5f0..8fd70a64 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -47,7 +47,6 @@ workflow INPUT_CHECK {
                         meta[0].id = "${meta[0].id}_${meta[0].external_id}"
                     }
                 }
-                println "${meta[0].id} ${meta[0]}"
                 processedIDs << meta[0].id
                 tuple(meta[0].id, meta[0])
         }
@@ -135,7 +134,7 @@ def format_reads(ArrayList sheet_data){
     def error_occured = false
     meta.id = sheet_data[0] // id is first value
     meta.sample = sheet_data[1].external_id
-    meta.external_id = sheet_data[0]
+    meta.external_id = sheet_data[1].external_id
 
     meta.hybrid = false
     meta.assembly = false

From a2c56a8b1f67cd98c0d3dc8602f4da5daa38652d Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Tue, 12 Nov 2024 11:59:35 -0600
Subject: [PATCH 30/32] fixed failing test case

---
 modules/local/report.nf |  4 --
 tests/main.nf.test      | 84 ++++++++++++++++++++---------------------
 2 files changed, 42 insertions(+), 46 deletions(-)

diff --git a/modules/local/report.nf b/modules/local/report.nf
index bd14fce7..5b131278 100644
--- a/modules/local/report.nf
+++ b/modules/local/report.nf
@@ -41,9 +41,7 @@ process REPORT{
         def report_tag = test_in[i+1]
         def report_value = test_in[i+2]
 
-        println meta_data
         if(!sample_data.containsKey(meta_data.sample)){
-            // Todo issue grabbing correct tag is here
             sample_data[meta_data.sample] = [:]
             sample_data[meta_data.sample]["meta"] = [:]
         }
@@ -680,8 +678,6 @@ def generate_qc_data(data, search_phrases, qc_species_tag){
     def species_tag_location = 0
     for(k in data){
         if(!k.value.meta.metagenomic){
-            println k.value
-            println k.key
             def species = get_species(k.value[k.key][top_hit_tag], search_phrases, shortest_token)
             // update coverage first so its values can be used in generating qc messages
             generate_coverage_data(data[k.key], params.coverage_calc_fields.bp_field, species)
diff --git a/tests/main.nf.test b/tests/main.nf.test
index ef559da2..b3fde7cd 100644
--- a/tests/main.nf.test
+++ b/tests/main.nf.test
@@ -155,45 +155,45 @@ nextflow_pipeline {
             // parse output json file
             def json = path("$launchDir/results/FinalReports/Aggregated/Json/final_report.json").json
 
-            assert json.short.short.RawReadSummary.R1."total_bp".equals(118750)
-            assert json.short.short.RawReadSummary.R1."total_reads".equals(475)
-            assert json.short.short.RawReadSummary.R1."read_qual_mean".equals(40.0)
-            assert json.short.short.RawReadSummary.R1."mean_sequence_length".equals(250.0)
-
-            assert json.short.short.FastP.summary.sequencing.equals("paired end (250 cycles + 250 cycles)")
-            assert json.short.short.FastP.summary.before_filtering.total_reads.equals(950)
-            assert json.short.short.FastP.filtering_result.passed_filter_reads.equals(950)
-            assert json.short.short.FastP.filtering_result.low_quality_reads.equals(0)
-            assert json.short.short.FastP.insert_size.peak.equals(347)
-
-            //assert json.short.meta.metagenomic.equals(false)  // Currently, this is "null".
-            assert json.short.meta.assembly.equals(false)
-            assert json.short.meta.hybrid.equals(false)
-            assert json.short.meta.single_end.equals(false)
-            assert json.short.meta.merge.equals(false)
-            assert json.short.meta.downsampled.equals(false)
-
-            assert json.short.short.AssemblyCompleted.equals(true)
-            assert json.short.short.QUAST."0"."Total length (>= 0 bp)".equals("4949")
-            assert json.short.short.QUAST."0"."Largest contig".equals("4949")
-            assert json.short.short.QUAST."0"."GC (%)".equals("52.96")
-            assert json.short.short.QUAST."0"."Avg. coverage depth".equals("47")
+            assert json.INX.INX.RawReadSummary.R1."total_bp".equals(118750)
+            assert json.INX.INX.RawReadSummary.R1."total_reads".equals(475)
+            assert json.INX.INX.RawReadSummary.R1."read_qual_mean".equals(40.0)
+            assert json.INX.INX.RawReadSummary.R1."mean_sequence_length".equals(250.0)
+
+            assert json.INX.INX.FastP.summary.sequencing.equals("paired end (250 cycles + 250 cycles)")
+            assert json.INX.INX.FastP.summary.before_filtering.total_reads.equals(950)
+            assert json.INX.INX.FastP.filtering_result.passed_filter_reads.equals(950)
+            assert json.INX.INX.FastP.filtering_result.low_quality_reads.equals(0)
+            assert json.INX.INX.FastP.insert_size.peak.equals(347)
+
+            //assert json.INX.meta.metagenomic.equals(false)  // Currently, this is "null".
+            assert json.INX.meta.assembly.equals(false)
+            assert json.INX.meta.hybrid.equals(false)
+            assert json.INX.meta.single_end.equals(false)
+            assert json.INX.meta.merge.equals(false)
+            assert json.INX.meta.downsampled.equals(false)
+
+            assert json.INX.INX.AssemblyCompleted.equals(true)
+            assert json.INX.INX.QUAST."0"."Total length (>= 0 bp)".equals("4949")
+            assert json.INX.INX.QUAST."0"."Largest contig".equals("4949")
+            assert json.INX.INX.QUAST."0"."GC (%)".equals("52.96")
+            assert json.INX.INX.QUAST."0"."Avg. coverage depth".equals("47")
 
             // Below two values should be empty
-            assert json.short.short.StarAMR."0"."Genotype".equals("None")
-            assert json.short.short.StarAMR."0"."Predicted Phenotype".equals("Susceptible")
-            assert json.short.short.StarAMR."0"."Genome Length".equals("4949")
+            assert json.INX.INX.StarAMR."0"."Genotype".equals("None")
+            assert json.INX.INX.StarAMR."0"."Predicted Phenotype".equals("Susceptible")
+            assert json.INX.INX.StarAMR."0"."Genome Length".equals("4949")
 
-            assert json.short.short.CheckM."0"."# genomes".equals("5656")
-            assert json.short.short.CheckM."0"."# markers".equals("56")
-            assert json.short.short.CheckM."0"."# marker sets".equals("24")
-            assert json.short.short.CheckM."0".Contamination.equals("0.00")
+            assert json.INX.INX.CheckM."0"."# genomes".equals("5656")
+            assert json.INX.INX.CheckM."0"."# markers".equals("56")
+            assert json.INX.INX.CheckM."0"."# marker sets".equals("24")
+            assert json.INX.INX.CheckM."0".Contamination.equals("0.00")
 
-            assert json.short.short.SevenGeneMLSTReport[0].filename.equals("short.filtered.fasta.gz")
+            assert json.INX.INX.SevenGeneMLSTReport[0].filename.equals("short.filtered.fasta.gz")
 
-            assert json.short.short.Abricate."0".RESISTANCE.equals("NoData")  // All Abricate results for this are "NoData".
+            assert json.INX.INX.Abricate."0".RESISTANCE.equals("NoData")  // All Abricate results for this are "NoData".
 
-            def assembly_path = "$launchDir/results/Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz"
+            def assembly_path = "$launchDir/results/Assembly/FinalAssembly/INX/short.final.filtered.assembly.fasta.gz"
             assert path(assembly_path).exists()
 
             // parse assembly file
@@ -209,17 +209,17 @@ nextflow_pipeline {
             // output files
             assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Json/final_report.json" }.size() == 1
             assert iridanext_global.findAll { it.path == "FinalReports/Aggregated/Tables/final_report.tsv" }.size() == 1
-            assert iridanext_samples.short.findAll { it.path == "Assembly/FinalAssembly/short/short.final.filtered.assembly.fasta.gz" }.size() == 1
-            assert iridanext_samples.short.findAll { it.path == "Assembly/Quality/QUAST/short/short.transposed_short.quast.quality.tsv" }.size() == 1
-            assert iridanext_samples.short.findAll { it.path == "Assembly/Quality/SeqKitStats/short.seqkit.stats.summary.tsv" }.size() == 1
-            assert iridanext_samples.short.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/short.mash.screen.taxa.screen.screen" }.size() == 1
-            assert iridanext_samples.short.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1
-            assert iridanext_samples.short.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1
-            assert iridanext_samples.short.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1
-            assert iridanext_samples.short.findAll { it.path == "FinalReports/FlattenedReports/short.flat_sample.json.gz" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "Assembly/FinalAssembly/INX/short.final.filtered.assembly.fasta.gz" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "Assembly/Quality/QUAST/short/short.transposed_short.quast.quality.tsv" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "Assembly/Quality/SeqKitStats/short.seqkit.stats.summary.tsv" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "Assembly/PostProcessing/Speciation/MashScreen/short.mash.screen.taxa.screen.screen" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/MashScreen/short.mash.screen.reads.screen.screen" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/Trimmed/FastP/short.fastp.summary.json" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "Reads/Quality/RawReadQuality/short.read.scan.summary.json" }.size() == 1
+            assert iridanext_samples.INX.findAll { it.path == "FinalReports/FlattenedReports/INX.flat_sample.json.gz" }.size() == 1
 
             // output metadata
-            assert iridanext_metadata.short."QC Status" == "PASSED"
+            assert iridanext_metadata.INX."QC Status" == "PASSED"
 
         }
 

From a1c3f3eb3b8cea250111e807940c213fd07959fc Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Tue, 12 Nov 2024 14:14:35 -0600
Subject: [PATCH 31/32] updated changelog

---
 CHANGELOG.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 62f2a563..3b3d7fd1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,7 +19,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - RASUSA now used for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)
 
-- Sample names (`sample_name` field) can no longer begin with a period. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)
 
 ### `Updated`
 

From 899e35b949045ee31c4307dbb6bcc3ac272e33f9 Mon Sep 17 00:00:00 2001
From: Matthew Wells <mattwells9@shaw.ca>
Date: Tue, 12 Nov 2024 14:15:58 -0600
Subject: [PATCH 32/32] updated changelog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3b3d7fd1..90dbd1bc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Changed`
 
-- Added a `sample_name` field, `sample` still exists but is used for different purposes [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)
+- Added a `sample_name` field; `sample` still exists but is now used to incorporate additional names/identifiers in IRIDA-Next. [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)
 
 - RASUSA now used for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)