From efaafea784ccf01a40114548d45a570deb984247 Mon Sep 17 00:00:00 2001 From: Richard Harris Date: Fri, 13 Sep 2024 17:14:48 +0100 Subject: [PATCH 1/2] add new inosine_m6A and m5C mods, new RNA v5.1.0 canonical and mods models, and DNA v5.0.0 mods@v2 models --- dorado/models/metadata.cpp | 20 ++++-- dorado/models/metadata.h | 3 + dorado/models/models.cpp | 126 +++++++++++++++++++++++++++++++++++- tests/ModelMetadataTest.cpp | 12 +++- 4 files changed, 152 insertions(+), 9 deletions(-) diff --git a/dorado/models/metadata.cpp b/dorado/models/metadata.cpp index 412ffce21..6d83ab58d 100644 --- a/dorado/models/metadata.cpp +++ b/dorado/models/metadata.cpp @@ -20,17 +20,24 @@ const std::unordered_map map = { namespace mods { const std::unordered_map map = { - {"4mC_5mC", ModsVariant::M_4mC_5mC}, {"5mC_5hmC", ModsVariant::M_5mC_5hmC}, - {"5mCG", ModsVariant::M_5mCG}, {"5mCG_5hmCG", ModsVariant::M_5mCG_5hmCG}, - {"5mC", ModsVariant::M_5mC}, {"6mA", ModsVariant::M_6mA}, - {"m6A", ModsVariant::M_m6A}, {"m6A_DRACH", ModsVariant::M_m6A_DRACH}, + {"4mC_5mC", ModsVariant::M_4mC_5mC}, + {"5mC_5hmC", ModsVariant::M_5mC_5hmC}, + {"5mCG", ModsVariant::M_5mCG}, + {"5mCG_5hmCG", ModsVariant::M_5mCG_5hmCG}, + {"5mC", ModsVariant::M_5mC}, + {"m5C", ModsVariant::M_m5C}, + {"6mA", ModsVariant::M_6mA}, + {"m6A", ModsVariant::M_m6A}, + {"m6A_DRACH", ModsVariant::M_m6A_DRACH}, + {"inosine_m6A", ModsVariant::M_inosine_m6A}, {"pseU", ModsVariant::M_pseU}, }; const std::unordered_map canonical_base_map = { {ModsVariant::M_4mC_5mC, "C"}, {ModsVariant::M_5mC_5hmC, "C"}, {ModsVariant::M_5mCG, "C"}, {ModsVariant::M_5mCG_5hmCG, "C"}, - {ModsVariant::M_5mC, "C"}, {ModsVariant::M_6mA, "A"}, + {ModsVariant::M_5mC, "C"}, {ModsVariant::M_m5C, "C"}, + {ModsVariant::M_6mA, "A"}, {ModsVariant::M_inosine_m6A, "A"}, {ModsVariant::M_m6A, "A"}, {ModsVariant::M_m6A_DRACH, "A"}, {ModsVariant::M_pseU, "T"}, }; @@ -48,7 +55,8 @@ const std::unordered_map map = { {"v3.5.2", ModelVersion::v3_5_2}, {"v3.6.0", ModelVersion::v3_6_0}, {"v4.0.0", ModelVersion::v4_0_0}, {"v4.1.0", ModelVersion::v4_1_0}, {"v4.2.0", ModelVersion::v4_2_0}, {"v4.3.0", ModelVersion::v4_3_0}, - {"v5.0.0", ModelVersion::v5_0_0}, {"latest", ModelVersion::NONE}}; + {"v5.0.0", ModelVersion::v5_0_0}, {"v5.1.0", ModelVersion::v5_1_0}, + {"latest", ModelVersion::NONE}}; } // namespace version const std::unordered_map& model_variants_map() { diff --git a/dorado/models/metadata.h b/dorado/models/metadata.h index 4a8a8cca3..22c76b3c9 100644 --- a/dorado/models/metadata.h +++ b/dorado/models/metadata.h @@ -25,9 +25,11 @@ enum class ModsVariant : uint8_t { M_5mCG, M_5mCG_5hmCG, M_5mC, + M_m5C, M_6mA, M_m6A, M_m6A_DRACH, + M_inosine_m6A, M_pseU, NONE // NONE must be last }; @@ -53,6 +55,7 @@ enum class ModelVersion : uint8_t { v4_2_0, v4_3_0, v5_0_0, + v5_1_0, NONE // NONE must be last }; diff --git a/dorado/models/models.cpp b/dorado/models/models.cpp index 772d2a859..fecfcac7a 100644 --- a/dorado/models/models.cpp +++ b/dorado/models/models.cpp @@ -408,7 +408,7 @@ const std::vector models = { ModelVariantPair{ModelVariant::HAC, VV::v3_0_0, true}, }, - // RNA004 + // RNA004 v3.0.1 ModelInfo{ "rna004_130bps_fast@v3.0.1", "2afa5de03f28162dd85b7be4a2dda108be7cc0a19062db7cb8460628aac462c0", @@ -427,6 +427,7 @@ const std::vector models = { CC::RNA004_130BPS, ModelVariantPair{ModelVariant::SUP, VV::v3_0_1}, }, + // RNA v5.0.0 ModelInfo{ "rna004_130bps_fast@v5.0.0", "3b45ecedf2e20c56e15033402deb77f3c4e67df49aea8d7b76acdbb4029e8ea0", @@ -445,6 +446,25 @@ const std::vector models = { CC::RNA004_130BPS, ModelVariantPair{ModelVariant::SUP, VV::v5_0_0}, }, + // RNA v5.1.0 + ModelInfo{ + "rna004_130bps_fast@v5.1.0", + "c01353ac8362479ceedf607c41e5f238efd629725556d896161baa194b7354be", + CC::RNA004_130BPS, + ModelVariantPair{ModelVariant::FAST, VV::v5_1_0}, + }, + ModelInfo{ + "rna004_130bps_hac@v5.1.0", + "36ac8bdb2baaf32e697086962078f83a001a3ffe1461e358fabef15c08b15c5e", + CC::RNA004_130BPS, + ModelVariantPair{ModelVariant::HAC, VV::v5_1_0, true}, + }, + ModelInfo{ + "rna004_130bps_sup@v5.1.0", + "ab7c5687f149901868898791b8d243c28e8345c9b61e3abce30d63e112ebc3b1", + CC::RNA004_130BPS, + ModelVariantPair{ModelVariant::SUP, VV::v5_1_0}, + }, }; } // namespace simplex @@ -801,6 +821,20 @@ const std::vector models = { ModelVariantPair{ModelVariant::SUP, VV::v5_0_0}, ModsVariantPair{ModsVariant::M_4mC_5mC, VV::v1_0_0}, }, + ModelInfo{ + "dna_r10.4.1_e8.2_400bps_hac@v5.0.0_4mC_5mC@v2", + "d7c4ee43e954b081a0179e5236245a62094fcecb1454de2b3901f2b10d8807d7", + CC::DNA_R10_4_1_E8_2_400BPS_5KHZ, + ModelVariantPair{ModelVariant::HAC, VV::v5_0_0}, + ModsVariantPair{ModsVariant::M_4mC_5mC, VV::v2_0_0}, + }, + ModelInfo{ + "dna_r10.4.1_e8.2_400bps_sup@v5.0.0_4mC_5mC@v2", + "eb971340e111ebfdb27bd2b70390c5f0252ba91f5ac92eea0dbd59524bac68f7", + CC::DNA_R10_4_1_E8_2_400BPS_5KHZ, + ModelVariantPair{ModelVariant::SUP, VV::v5_0_0}, + ModsVariantPair{ModsVariant::M_4mC_5mC, VV::v2_0_0}, + }, // 5mC+5hmC all-context HAC and SUP ModelInfo{ "dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mC_5hmC@v1", @@ -816,6 +850,20 @@ const std::vector models = { ModelVariantPair{ModelVariant::SUP, VV::v5_0_0}, ModsVariantPair{ModsVariant::M_5mC_5hmC, VV::v1_0_0}, }, + ModelInfo{ + "dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mC_5hmC@v2", + "8bde4f0fd27a2e2fbf98942a6e1cc1d4547c6678a69940a2152a6a5cdb98cc3c", + CC::DNA_R10_4_1_E8_2_400BPS_5KHZ, + ModelVariantPair{ModelVariant::HAC, VV::v5_0_0}, + ModsVariantPair{ModsVariant::M_5mC_5hmC, VV::v2_0_0}, + }, + ModelInfo{ + "dna_r10.4.1_e8.2_400bps_sup@v5.0.0_5mC_5hmC@v2", + "36de2e58edaf1e1a53bca0ebf029164112b9dbaad413672dde45efb093b7fcf6", + CC::DNA_R10_4_1_E8_2_400BPS_5KHZ, + ModelVariantPair{ModelVariant::SUP, VV::v5_0_0}, + ModsVariantPair{ModsVariant::M_5mC_5hmC, VV::v2_0_0}, + }, // 5mC+5hmC CG-context HAC and SUP ModelInfo{ "dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG@v1", @@ -831,6 +879,20 @@ const std::vector models = { ModelVariantPair{ModelVariant::SUP, VV::v5_0_0}, ModsVariantPair{ModsVariant::M_5mCG_5hmCG, VV::v1_0_0}, }, + ModelInfo{ + "dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG@v2", + "5c2452e4ccd443e7f6549afb6ac732b03b90480801e2a29850e5616185cb6d5b", + CC::DNA_R10_4_1_E8_2_400BPS_5KHZ, + ModelVariantPair{ModelVariant::HAC, VV::v5_0_0}, + ModsVariantPair{ModsVariant::M_5mCG_5hmCG, VV::v2_0_0}, + }, + ModelInfo{ + "dna_r10.4.1_e8.2_400bps_sup@v5.0.0_5mCG_5hmCG@v2", + "2e2d4de2ec1df90b37c50b3367bea90f7b9dfab11b90e98ee6963876589be4cc", + CC::DNA_R10_4_1_E8_2_400BPS_5KHZ, + ModelVariantPair{ModelVariant::SUP, VV::v5_0_0}, + ModsVariantPair{ModsVariant::M_5mCG_5hmCG, VV::v2_0_0}, + }, // 6mA all-context HAC and SUP ModelInfo{ "dna_r10.4.1_e8.2_400bps_hac@v5.0.0_6mA@v1", @@ -846,7 +908,20 @@ const std::vector models = { ModelVariantPair{ModelVariant::SUP, VV::v5_0_0}, ModsVariantPair{ModsVariant::M_6mA, VV::v1_0_0}, }, - + ModelInfo{ + "dna_r10.4.1_e8.2_400bps_hac@v5.0.0_6mA@v2", + "919aaf7fdfbf50a1fe20124e07014fa2b38cc10f3dadb27c56b415309147eee9", + CC::DNA_R10_4_1_E8_2_400BPS_5KHZ, + ModelVariantPair{ModelVariant::HAC, VV::v5_0_0}, + ModsVariantPair{ModsVariant::M_6mA, VV::v2_0_0}, + }, + ModelInfo{ + "dna_r10.4.1_e8.2_400bps_sup@v5.0.0_6mA@v2", + "fc1d247475162d4f782d66bb3cd6f19c76e5589a8e064f738de4896f940568b3", + CC::DNA_R10_4_1_E8_2_400BPS_5KHZ, + ModelVariantPair{ModelVariant::SUP, VV::v5_0_0}, + ModsVariantPair{ModsVariant::M_6mA, VV::v2_0_0}, + }, // RNA004 v3.0.1 ModelInfo{ "rna004_130bps_sup@v3.0.1_m6A_DRACH@v1", @@ -885,6 +960,53 @@ const std::vector models = { CC::RNA004_130BPS, ModelVariantPair{ModelVariant::SUP, VV::v5_0_0}, ModsVariantPair{ModsVariant::M_pseU, VV::v1_0_0}, + }, + + // RNA004 v5.1.0 + // m5C - all context + ModelInfo{ + "rna004_130bps_hac@v5.1.0_m5C@v1", + "d9c142ba65c15cebaf42ea44a3e5731bc3d59f89a2b07e55701f7152bde2937e", + CC::RNA004_130BPS, + ModelVariantPair{ModelVariant::HAC, VV::v5_1_0}, + ModsVariantPair{ModsVariant::M_m5C, VV::v1_0_0}, + }, + ModelInfo{ + "rna004_130bps_sup@v5.1.0_m5C@v1", + "073a9a66a613f61fca83447816c4fd95ce608c854b54540e2a9f82b4c1498a3a", + CC::RNA004_130BPS, + ModelVariantPair{ModelVariant::SUP, VV::v5_1_0}, + ModsVariantPair{ModsVariant::M_m5C, VV::v1_0_0}, + }, + // inosine_m6A - all context + ModelInfo{ + "rna004_130bps_hac@v5.1.0_inosine_m6A@v1", + "e709c9ce7e256f8d2bb259a0ab22d2bddc60c61834d3a020e2c8fc5721c5d548", + CC::RNA004_130BPS, + ModelVariantPair{ModelVariant::HAC, VV::v5_1_0}, + ModsVariantPair{ModsVariant::M_inosine_m6A, VV::v1_0_0}, + }, + ModelInfo{ + "rna004_130bps_sup@v5.1.0_inosine_m6A@v1", + "8bcbd48f9f01eb624a8fdcb928c204b915ed002c1ddc600dfa3c2be16879b7df", + CC::RNA004_130BPS, + ModelVariantPair{ModelVariant::SUP, VV::v5_1_0}, + ModsVariantPair{ModsVariant::M_inosine_m6A, VV::v1_0_0}, + }, + // pseU - all context + ModelInfo{ + "rna004_130bps_hac@v5.1.0_pseU@v1", + "5d7c3cf12736baaba987c2ca899abd89193e859edfc7b9aad82a00e4bbc2e6bd", + CC::RNA004_130BPS, + ModelVariantPair{ModelVariant::HAC, VV::v5_1_0}, + ModsVariantPair{ModsVariant::M_pseU, VV::v1_0_0}, + }, + ModelInfo{ + "rna004_130bps_sup@v5.1.0_pseU@v1", + "02049be4f690cdf4a1200f6077b657c43587d1be2816fab01bb3f02f06e2cb7c", + CC::RNA004_130BPS, + ModelVariantPair{ModelVariant::SUP, VV::v5_1_0}, + ModsVariantPair{ModsVariant::M_pseU, VV::v1_0_0}, }}; } // namespace modified diff --git a/tests/ModelMetadataTest.cpp b/tests/ModelMetadataTest.cpp index db9cef2a7..2331490b6 100644 --- a/tests/ModelMetadataTest.cpp +++ b/tests/ModelMetadataTest.cpp @@ -57,9 +57,11 @@ TEST_CASE(TEST_TAG " ModsVariant enumeration", TEST_TAG) { CHECK(mods.at("5mCG") == ModsVariant::M_5mCG); CHECK(mods.at("5mCG_5hmCG") == ModsVariant::M_5mCG_5hmCG); CHECK(mods.at("5mC") == ModsVariant::M_5mC); + CHECK(mods.at("m5C") == ModsVariant::M_m5C); CHECK(mods.at("6mA") == ModsVariant::M_6mA); CHECK(mods.at("m6A") == ModsVariant::M_m6A); CHECK(mods.at("m6A_DRACH") == ModsVariant::M_m6A_DRACH); + CHECK(mods.at("inosine_m6A") == ModsVariant::M_inosine_m6A); CHECK(mods.at("pseU") == ModsVariant::M_pseU); CHECK(mods.size() == static_cast(ModsVariant::NONE)); } @@ -70,9 +72,11 @@ TEST_CASE(TEST_TAG " ModsVariant enumeration", TEST_TAG) { CHECK(get_mods_variant("5mCG") == ModsVariant::M_5mCG); CHECK(get_mods_variant("5mCG_5hmCG") == ModsVariant::M_5mCG_5hmCG); CHECK(get_mods_variant("5mC") == ModsVariant::M_5mC); + CHECK(get_mods_variant("m5C") == ModsVariant::M_m5C); CHECK(get_mods_variant("6mA") == ModsVariant::M_6mA); CHECK(get_mods_variant("m6A") == ModsVariant::M_m6A); CHECK(get_mods_variant("m6A_DRACH") == ModsVariant::M_m6A_DRACH); + CHECK(get_mods_variant("inosine_m6A") == ModsVariant::M_inosine_m6A); CHECK(get_mods_variant("pseU") == ModsVariant::M_pseU); for (const auto& it : {"", "foo", "dna_r10.4.1_e8.2_400bps_sup@v4.2.0_5mC@v2"}) { CHECK(get_mods_variant(it) == ModsVariant::NONE); @@ -85,9 +89,11 @@ TEST_CASE(TEST_TAG " ModsVariant enumeration", TEST_TAG) { CHECK(to_string(ModsVariant::M_5mCG) == "5mCG"); CHECK(to_string(ModsVariant::M_5mCG_5hmCG) == "5mCG_5hmCG"); CHECK(to_string(ModsVariant::M_5mC) == "5mC"); + CHECK(to_string(ModsVariant::M_m5C) == "m5C"); CHECK(to_string(ModsVariant::M_6mA) == "6mA"); - CHECK(to_string(ModsVariant::M_m6A_DRACH) == "m6A_DRACH"); CHECK(to_string(ModsVariant::M_m6A) == "m6A"); + CHECK(to_string(ModsVariant::M_m6A_DRACH) == "m6A_DRACH"); + CHECK(to_string(ModsVariant::M_inosine_m6A) == "inosine_m6A"); CHECK(to_string(ModsVariant::M_pseU) == "pseU"); CHECK_THROWS_AS(to_string(ModsVariant::NONE), std::logic_error); } @@ -116,7 +122,9 @@ TEST_CASE(TEST_TAG " mods_canonical_base_map", TEST_TAG) { CHECK(mods.at(ModsVariant::M_5mCG) == "C"); CHECK(mods.at(ModsVariant::M_5mCG_5hmCG) == "C"); CHECK(mods.at(ModsVariant::M_5mC) == "C"); + CHECK(mods.at(ModsVariant::M_m5C) == "C"); CHECK(mods.at(ModsVariant::M_6mA) == "A"); + CHECK(mods.at(ModsVariant::M_inosine_m6A) == "A"); CHECK(mods.at(ModsVariant::M_m6A) == "A"); CHECK(mods.at(ModsVariant::M_m6A_DRACH) == "A"); CHECK(mods.at(ModsVariant::M_pseU) == "T"); @@ -146,6 +154,8 @@ TEST_CASE(TEST_TAG " ModelVersion enumeration", TEST_TAG) { CHECK(to_string(ModelVersion::v4_1_0) == "v4.1.0"); CHECK(to_string(ModelVersion::v4_2_0) == "v4.2.0"); CHECK(to_string(ModelVersion::v4_3_0) == "v4.3.0"); + CHECK(to_string(ModelVersion::v5_0_0) == "v5.0.0"); + CHECK(to_string(ModelVersion::v5_1_0) == "v5.1.0"); CHECK(to_string(ModelVersion::NONE) == "latest"); CHECK(vers.size() == static_cast(ModelVersion::NONE) + 1); // +1 as "NONE" is included in the map From 42d98d809490eca50c7ddfbce828fb4edaf81ab9 Mon Sep 17 00:00:00 2001 From: David Newman Date: Mon, 16 Sep 2024 10:42:28 +0100 Subject: [PATCH 2/2] Mod-models: added RNA m6A-DRACH models for basecall models v5.0 and v5.1 --- dorado/models/models.cpp | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/dorado/models/models.cpp b/dorado/models/models.cpp index fecfcac7a..fed9a87c4 100644 --- a/dorado/models/models.cpp +++ b/dorado/models/models.cpp @@ -923,6 +923,7 @@ const std::vector models = { ModsVariantPair{ModsVariant::M_6mA, VV::v2_0_0}, }, // RNA004 v3.0.1 + // m6A - DRACH ModelInfo{ "rna004_130bps_sup@v3.0.1_m6A_DRACH@v1", "356b3eed19916d83d59cbfd24bb9f33823d6f738891f3ac8fe77319ae5cbde7f", @@ -946,6 +947,21 @@ const std::vector models = { ModelVariantPair{ModelVariant::SUP, VV::v5_0_0}, ModsVariantPair{ModsVariant::M_m6A, VV::v1_0_0}, }, + // m6A - DRACH + ModelInfo{ + "rna004_130bps_hac@v5.0.0_m6A_DRACH@v1", + "b140acbfc04bb24080b39cc81d71016895dc74454c7cb630629b93ec60e315c9", + CC::RNA004_130BPS, + ModelVariantPair{ModelVariant::HAC, VV::v5_0_0}, + ModsVariantPair{ModsVariant::M_m6A_DRACH, VV::v1_0_0}, + }, + ModelInfo{ + "rna004_130bps_sup@v5.0.0_m6A_DRACH@v1", + "62dd2d9e225fa9638258bd33063fa930c4179b13878064547d5be7b33d478b23", + CC::RNA004_130BPS, + ModelVariantPair{ModelVariant::SUP, VV::v5_0_0}, + ModsVariantPair{ModsVariant::M_m6A_DRACH, VV::v1_0_0}, + }, // pseU - all context ModelInfo{ "rna004_130bps_hac@v5.0.0_pseU@v1", @@ -993,6 +1009,21 @@ const std::vector models = { ModelVariantPair{ModelVariant::SUP, VV::v5_1_0}, ModsVariantPair{ModsVariant::M_inosine_m6A, VV::v1_0_0}, }, + // m6A - DRACH + ModelInfo{ + "rna004_130bps_hac@v5.1.0_m6A_DRACH@v1", + "911ba609b657f8e24fe44519a965d0d9bac91f35e7026c8ee1614492bf7ce3f9", + CC::RNA004_130BPS, + ModelVariantPair{ModelVariant::HAC, VV::v5_1_0}, + ModsVariantPair{ModsVariant::M_m6A_DRACH, VV::v1_0_0}, + }, + ModelInfo{ + "rna004_130bps_sup@v5.1.0_m6A_DRACH@v1", + "ec616e5d725860e1686c17d70c8f135c6e0e66f6c3e7e28a6cdefe19cae2e91f", + CC::RNA004_130BPS, + ModelVariantPair{ModelVariant::SUP, VV::v5_1_0}, + ModsVariantPair{ModsVariant::M_m6A_DRACH, VV::v1_0_0}, + }, // pseU - all context ModelInfo{ "rna004_130bps_hac@v5.1.0_pseU@v1",