diff --git a/CHANGES.md b/CHANGES.md index 0c2b1a45..ab30de1a 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,14 +2,20 @@ ## development version +* #476: Significantly improve speed and accuracy by enabling by default a new + variant of multi-context seeds: When no regular seeds - which consist + of two strobes - can be found for the entire query, strobealign attempts to find + single-strobe ("partial") seeds. + The `--mcs` option is still available for now. It is a bit slower, but + slightly more accurate. * #468: Be less strict when checking reference sequence names. ## v0.15.0 (2024-12-13) * #388 and #426: Increase accuracy and mapping rate for reads shorter than about 200 bp by introducing multi-context seeds. - Previously, seeds always consisted of two k-mers and would only be found if - both occur in query and reference. + Previously, seeds always consisted of two k-mers ("strobes") and would only + be found if both occur in query and reference. With this change, strobealign falls back to looking up just one of the k-mers when appropriate. This feature is currently *experimental* and only enabled when using the diff --git a/README.md b/README.md index 1cf7350a..487ec9a6 100644 --- a/README.md +++ b/README.md @@ -223,6 +223,29 @@ actual mapping: - Index files are about four times as large as the reference. +## Explanation + +### Multi-context seeds + +Strobealign uses randstrobes as seeds, which in our case consist of two k-mers +("strobes") that are somewhat close to each other. When a seed is looked up +in the index, it is only found if both strobes match. By changing the way in +which the index is stored in v0.15.0, it became possible to support +*multi-context seeds*. With those changes, strobealign falls back to looking +up only one of the strobes (a "partial seed") if the full seed cannot be found. +This results in better mapping rate and accuracy for read lengths of up to +about 200 nt. + +Usage of multi-context seeds is enabled by default in strobealign since v0.16.0. +The strategy is to first search for all full seeds of the query and fall back to +partial seeds if *no* seeds could be found. + +A slightly more accurate, but slower mode of using multi-context seeds is +available by using option `--mcs`: With it, the strategy is changed to a +fallback *per seed*: If an individual full seed cannot be found, its partial +version is looked up in the index. + + ## Changelog See [Changelog](CHANGES.md). diff --git a/src/cmdline.cpp b/src/cmdline.cpp index a4f2c068..97eb82b8 100644 --- a/src/cmdline.cpp +++ b/src/cmdline.cpp @@ -54,7 +54,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) { args::ValueFlag end_bonus(parser, "INT", "Soft clipping penalty [10]", {'L'}); args::Group search(parser, "Search parameters:"); - args::Flag mcs(parser, "mcs", "Use multi-context seeds for finding hits", {"mcs"}); + args::Flag mcs(parser, "mcs", "Use extended multi-context seed mode for finding hits. Slightly more accurate, but slower", {"mcs"}); args::ValueFlag f(parser, "FLOAT", "Top fraction of repetitive strobemers to filter out from sampling [0.0002]", {'f'}); args::ValueFlag S(parser, "FLOAT", "Try candidate sites with mapping score at least S of maximum mapping score [0.5]", {'S'}); args::ValueFlag M(parser, "INT", "Maximum number of mapping sites to try [20]", {'M'}); diff --git a/src/nam.cpp b/src/nam.cpp index f07114e2..fbd81e94 100644 --- a/src/nam.cpp +++ b/src/nam.cpp @@ -219,6 +219,21 @@ std::tuple> find_nams( } } + // Rescue using partial hits, even in non-MCS mode + if (total_hits == 0 && !use_mcs) { + for (const auto &q : query_randstrobes) { + size_t partial_pos = index.find_partial(q.hash); + if (partial_pos != index.end()) { + total_hits++; + if (index.is_partial_filtered(partial_pos)) { + continue; + } + nr_good_hits++; + add_to_matches_map_partial(matches_map[q.is_reverse], q.start, q.start + index.k(), index, partial_pos); + } + } + } + float nonrepetitive_fraction = total_hits > 0 ? ((float) nr_good_hits) / ((float) total_hits) : 1.0; auto nams = merge_matches_into_nams_forward_and_reverse(matches_map, index.k(), use_mcs); return {nonrepetitive_fraction, nr_good_hits, nams}; @@ -249,43 +264,34 @@ std::pair> find_nams_rescue( } }; std::array>, 2> matches_map; - std::vector hits_fw; - std::vector hits_rc; + std::array, 2> hits; matches_map[0].reserve(100); matches_map[1].reserve(100); - hits_fw.reserve(5000); - hits_rc.reserve(5000); + hits[0].reserve(5000); + hits[1].reserve(5000); for (auto &qr : query_randstrobes) { size_t position = index.find_full(qr.hash); if (position != index.end()) { unsigned int count = index.get_count_full(position); RescueHit rh{position, count, qr.start, qr.end, false}; - if (qr.is_reverse){ - hits_rc.push_back(rh); - } else { - hits_fw.push_back(rh); - } + hits[qr.is_reverse].push_back(rh); } else if (use_mcs) { size_t partial_pos = index.find_partial(qr.hash); if (partial_pos != index.end()) { unsigned int partial_count = index.get_count_partial(partial_pos); RescueHit rh{partial_pos, partial_count, qr.start, qr.start + index.k(), true}; - if (qr.is_reverse){ - hits_rc.push_back(rh); - } else { - hits_fw.push_back(rh); - } + hits[qr.is_reverse].push_back(rh); } } } - std::sort(hits_fw.begin(), hits_fw.end()); - std::sort(hits_rc.begin(), hits_rc.end()); + std::sort(hits[0].begin(), hits[0].end()); + std::sort(hits[1].begin(), hits[1].end()); int n_hits = 0; size_t is_revcomp = 0; - for (auto& rescue_hits : {hits_fw, hits_rc}) { + for (auto& rescue_hits : hits) { int cnt = 0; for (auto &rh : rescue_hits) { if ((rh.count > rescue_cutoff && cnt >= 5) || rh.count > 1000) { diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt index d1973b3e..9b467e40 100644 --- a/tests/baseline-commit.txt +++ b/tests/baseline-commit.txt @@ -1 +1 @@ -7f5ac330f68afbe32e0cde4a32bbb786cdd4d01e +acc4cffe5ac2c4db266c58d00b7b6462c6b4189c