From 54c5ef458ef70682340c7a95f6a017176d77df25 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Tue, 27 Aug 2024 09:21:12 +0200 Subject: [PATCH] Do not store parameter K as a constant, but a field. --- sa-index/src/bounds_table.rs | 22 +++++++++++---------- sa-index/src/sa_searcher.rs | 37 ++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/sa-index/src/bounds_table.rs b/sa-index/src/bounds_table.rs index 9785abc..a8fcf3c 100644 --- a/sa-index/src/bounds_table.rs +++ b/sa-index/src/bounds_table.rs @@ -1,15 +1,16 @@ -pub struct BoundsCache { +pub struct BoundsCache { pub bounds: Vec>, + pub base: usize, + pub k: usize, ascii_array: [usize; 128], powers_array: [usize; 10], - alphabet: Vec, - base: usize + alphabet: Vec } -impl BoundsCache { - pub fn new(alphabet: String) -> BoundsCache { - assert!(K < 10, "K must be less than 10"); +impl BoundsCache { + pub fn new(alphabet: String, k: usize) -> BoundsCache { + assert!(k < 10, "K must be less than 10"); let alphabet = alphabet.to_uppercase().as_bytes().to_vec(); let base = alphabet.len(); @@ -25,14 +26,15 @@ impl BoundsCache { } // 20^1 + 20^2 + 20^3 + ... + 20^(K) = (20^(K + 1) - 20) / 19 - let capacity = (base.pow(K + 1) - base) / (base - 1); + let capacity = (base.pow(k as u32 + 1) - base) / (base - 1); - BoundsCache { + Self { bounds: vec![None; capacity], ascii_array, powers_array, alphabet, - base + base, + k } } @@ -88,7 +90,7 @@ mod tests { #[test] fn test_bounds_cache() { - let kmer_cache = BoundsCache::<5>::new("ACDEFGHIKLMNPQRSTVWY".to_string()); + let kmer_cache = BoundsCache::new("ACDEFGHIKLMNPQRSTVWY".to_string(), 5); for i in 0..20_usize.pow(5) { let kmer = kmer_cache.index_to_kmer(i); diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index cad8053..0814218 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -1,5 +1,4 @@ use std::{cmp::min, ops::Deref}; -use std::str::from_utf8; use sa_mappings::proteins::{Protein, Proteins}; use crate::{ @@ -75,9 +74,9 @@ impl PartialEq for SearchAllSuffixesResult { pub struct SparseSearcher(Searcher); impl SparseSearcher { - pub fn new(sa: SuffixArray, proteins: Proteins) -> Self { + pub fn new(sa: SuffixArray, proteins: Proteins, k: usize) -> Self { let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); - let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); + let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), k); Self(searcher) } } @@ -93,9 +92,9 @@ impl Deref for SparseSearcher { pub struct DenseSearcher(Searcher); impl DenseSearcher { - pub fn new(sa: SuffixArray, proteins: Proteins) -> Self { + pub fn new(sa: SuffixArray, proteins: Proteins, k: usize) -> Self { let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.input_string); - let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); + let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), k); Self(searcher) } } @@ -122,7 +121,7 @@ impl Deref for DenseSearcher { /// the functional analysis provided by Unipept pub struct Searcher { pub sa: SuffixArray, - pub kmer_cache: BoundsCache<5>, + pub kmer_cache: BoundsCache, pub proteins: Proteins, pub suffix_index_to_protein: Box } @@ -144,15 +143,15 @@ impl Searcher { /// # Returns /// /// Returns a new Searcher object - pub fn new(sa: SuffixArray, proteins: Proteins, suffix_index_to_protein: Box) -> Self { + pub fn new(sa: SuffixArray, proteins: Proteins, suffix_index_to_protein: Box, k: usize) -> Self { // Create a KTable with all possible 3-mers - let mut kmer_cache = BoundsCache::new("ACDEFGHIKLMNPQRSTVWY".to_string()); + let mut kmer_cache = BoundsCache::new("ACDEFGHIKLMNPQRSTVWY".to_string(), k); // Create the Searcher object let mut searcher = Self { sa, kmer_cache, proteins, suffix_index_to_protein }; // Update the bounds for all 3-mers in the KTable - for i in 0..20_usize.pow(5) { + for i in 0..searcher.kmer_cache.base.pow(k as u32) { let kmer = searcher.kmer_cache.index_to_kmer(i); // Calculate stricter starting bounds for the 3-mers @@ -308,7 +307,7 @@ impl Searcher { // Use the (up to) first 5 characters of the search string as the kmer // If the kmer is found in the cache, use the bounds from the cache as start bounds // to find the bounds of the entire string - let mut max_mer_length = min(5, search_string.len()); + let mut max_mer_length = min(self.kmer_cache.k, search_string.len()); if let Some(bounds) = self.kmer_cache.get_kmer(&search_string[..max_mer_length]) { return self.search_bounds_no_cache(search_string, bounds); } @@ -568,7 +567,7 @@ mod tests { let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1); let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); - let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); + let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), 3); // search bounds 'A' let bounds_res = searcher.search_bounds(&[b'A']); @@ -590,7 +589,7 @@ mod tests { let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3); let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); - let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); + let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), 3); // search suffix 'VAA' let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false); @@ -607,7 +606,7 @@ mod tests { let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1); let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); - let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); + let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), 3); let bounds_res = searcher.search_bounds(&[b'I']); assert_eq!(bounds_res, BoundSearchResult::SearchResult((13, 16))); @@ -623,7 +622,7 @@ mod tests { let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3); let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); - let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); + let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), 3); // search bounds 'RIZ' with equal I and L let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true); @@ -650,7 +649,7 @@ mod tests { let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2); let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); - let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); + let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3); // search bounds 'IM' with equal I and L let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true); @@ -672,7 +671,7 @@ mod tests { let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1); let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); - let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); + let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3); let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![2, 3, 4, 5])); @@ -693,7 +692,7 @@ mod tests { let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); - let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); + let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3); let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4])); @@ -714,7 +713,7 @@ mod tests { let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2); let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); - let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); + let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3); // search all places where II is in the string IIIILL, but with a sparse SA // this way we check if filtering the suffixes works as expected @@ -737,7 +736,7 @@ mod tests { let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); - let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); + let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3); // search bounds 'IM' with equal I and L let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);