Merge pull request #476 from ksahlin/mcs-rescue

Rescue using partial hits, even in non-MCS mode
ksahlin · Jan 20, 2025 · bfce78a · bfce78a
2 parents b2105c6 + affcebf
commit bfce78a
Show file tree

Hide file tree

Showing 5 changed files with 56 additions and 21 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -2,14 +2,20 @@
 
 ## development version
 
+* #476: Significantly improve speed and accuracy by enabling by default a new
+  variant of multi-context seeds: When no regular seeds - which consist
+  of two strobes - can be found for the entire query, strobealign attempts to find
+  single-strobe ("partial") seeds.
+  The `--mcs` option is still available for now. It is a bit slower, but
+  slightly more accurate.
 * #468: Be less strict when checking reference sequence names.
 
 ## v0.15.0 (2024-12-13)
 
 * #388 and #426: Increase accuracy and mapping rate for reads shorter than
   about 200 bp by introducing multi-context seeds.
-  Previously, seeds always consisted of two k-mers and would only be found if
-  both occur in query and reference.
+  Previously, seeds always consisted of two k-mers ("strobes") and would only
+  be found if both occur in query and reference.
   With this change, strobealign falls back to looking up just one of the k-mers
   when appropriate.
   This feature is currently *experimental* and only enabled when using the

diff --git a/README.md b/README.md
@@ -223,6 +223,29 @@ actual mapping:
 - Index files are about four times as large as the reference.
 
 
+## Explanation
+
+### Multi-context seeds
+
+Strobealign uses randstrobes as seeds, which in our case consist of two k-mers
+("strobes") that are somewhat close to each other. When a seed is looked up
+in the index, it is only found if both strobes match. By changing the way in
+which the index is stored in v0.15.0, it became  possible to support
+*multi-context seeds*. With those changes, strobealign falls back to looking
+up only one of the strobes (a "partial seed") if the full seed cannot be found.
+This results in better mapping rate and accuracy for read lengths of up to
+about 200 nt.
+
+Usage of multi-context seeds is enabled by default in strobealign since v0.16.0.
+The strategy is to first search for all full seeds of the query and fall back to
+partial seeds if *no* seeds could be found.
+
+A slightly more accurate, but slower mode of using multi-context seeds is
+available by using option `--mcs`: With it, the strategy is changed to a
+fallback *per seed*: If an individual full seed cannot be found, its partial
+version is looked up in the index.
+
+
 ## Changelog
 
 See [Changelog](CHANGES.md).

diff --git a/src/cmdline.cpp b/src/cmdline.cpp
@@ -54,7 +54,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
     args::ValueFlag<int> end_bonus(parser, "INT", "Soft clipping penalty [10]", {'L'});
 
     args::Group search(parser, "Search parameters:");
-    args::Flag mcs(parser, "mcs", "Use multi-context seeds for finding hits", {"mcs"});
+    args::Flag mcs(parser, "mcs", "Use extended multi-context seed mode for finding hits. Slightly more accurate, but slower", {"mcs"});
     args::ValueFlag<float> f(parser, "FLOAT", "Top fraction of repetitive strobemers to filter out from sampling [0.0002]", {'f'});
     args::ValueFlag<float> S(parser, "FLOAT", "Try candidate sites with mapping score at least S of maximum mapping score [0.5]", {'S'});
     args::ValueFlag<int> M(parser, "INT", "Maximum number of mapping sites to try [20]", {'M'});

diff --git a/src/nam.cpp b/src/nam.cpp
@@ -219,6 +219,21 @@ std::tuple<float, int, std::vector<Nam>> find_nams(
         }
     }
 
+    // Rescue using partial hits, even in non-MCS mode
+    if (total_hits == 0 && !use_mcs) {
+        for (const auto &q : query_randstrobes) {
+            size_t partial_pos = index.find_partial(q.hash);
+            if (partial_pos != index.end()) {
+                total_hits++;
+                if (index.is_partial_filtered(partial_pos)) {
+                    continue;
+                }
+                nr_good_hits++;
+                add_to_matches_map_partial(matches_map[q.is_reverse], q.start, q.start + index.k(), index, partial_pos);
+            }
+        }
+    }
+
     float nonrepetitive_fraction = total_hits > 0 ? ((float) nr_good_hits) / ((float) total_hits) : 1.0;
     auto nams = merge_matches_into_nams_forward_and_reverse(matches_map, index.k(), use_mcs);
     return {nonrepetitive_fraction, nr_good_hits, nams};
@@ -249,43 +264,34 @@ std::pair<int, std::vector<Nam>> find_nams_rescue(
         }
     };
     std::array<robin_hood::unordered_map<unsigned int, std::vector<Match>>, 2> matches_map;
-    std::vector<RescueHit> hits_fw;
-    std::vector<RescueHit> hits_rc;
+    std::array<std::vector<RescueHit>, 2> hits;
     matches_map[0].reserve(100);
     matches_map[1].reserve(100);
-    hits_fw.reserve(5000);
-    hits_rc.reserve(5000);
+    hits[0].reserve(5000);
+    hits[1].reserve(5000);
 
     for (auto &qr : query_randstrobes) {
         size_t position = index.find_full(qr.hash);
         if (position != index.end()) {
             unsigned int count = index.get_count_full(position);
             RescueHit rh{position, count, qr.start, qr.end, false};
-            if (qr.is_reverse){
-                hits_rc.push_back(rh);
-            } else {
-                hits_fw.push_back(rh);
-            }
+            hits[qr.is_reverse].push_back(rh);
         }
         else if (use_mcs) {
             size_t partial_pos = index.find_partial(qr.hash);
             if (partial_pos != index.end()) {
                 unsigned int partial_count = index.get_count_partial(partial_pos);
                 RescueHit rh{partial_pos, partial_count, qr.start, qr.start + index.k(), true};
-                if (qr.is_reverse){
-                    hits_rc.push_back(rh);
-                } else {
-                    hits_fw.push_back(rh);
-                }
+                hits[qr.is_reverse].push_back(rh);
             }
         }
     }
 
-    std::sort(hits_fw.begin(), hits_fw.end());
-    std::sort(hits_rc.begin(), hits_rc.end());
+    std::sort(hits[0].begin(), hits[0].end());
+    std::sort(hits[1].begin(), hits[1].end());
     int n_hits = 0;
     size_t is_revcomp = 0;
-    for (auto& rescue_hits : {hits_fw, hits_rc}) {
+    for (auto& rescue_hits : hits) {
         int cnt = 0;
         for (auto &rh : rescue_hits) {
             if ((rh.count > rescue_cutoff && cnt >= 5) || rh.count > 1000) {

diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt
@@ -1 +1 @@
-7f5ac330f68afbe32e0cde4a32bbb786cdd4d01e
+acc4cffe5ac2c4db266c58d00b7b6462c6b4189c
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		7f5ac330f68afbe32e0cde4a32bbb786cdd4d01e
		acc4cffe5ac2c4db266c58d00b7b6462c6b4189c