diff --git a/sa-index/src/bounds_table.rs b/sa-index/src/bounds_table.rs new file mode 100644 index 0000000..c1ac418 --- /dev/null +++ b/sa-index/src/bounds_table.rs @@ -0,0 +1,88 @@ +pub struct BoundsCache { + pub bounds: Vec>, + + ascii_array: [usize; 128], + alphabet: Vec, + base: usize +} + +impl BoundsCache { + pub fn new(alphabet: String) -> BoundsCache { + let alphabet = alphabet.to_uppercase().as_bytes().to_vec(); + let base = alphabet.len() + 1; + + let mut ascii_array: [usize; 128] = [0; 128]; + for (i, byte) in alphabet.iter().enumerate() { + // Add 1 to the index, so we can reserve a different value for the 0 index + ascii_array[*byte as usize] = i + 1; + } + + BoundsCache { + bounds: vec![None; base.pow(K)], + ascii_array, + alphabet, + base + } + } + + pub fn get_kmer(&self, kmer: &[u8]) -> Option<(usize, usize)> { + self.bounds.get(self.kmer_to_index(kmer)).cloned()? + } + + pub fn update_all_kmers(&mut self, kmer: &[u8], bounds: (usize, usize)) { + let index = self.kmer_to_index(kmer); + self.bounds[index] = Some(bounds); + } + + pub fn update_kmer(&mut self, kmer: &[u8], bounds: (usize, usize)) { + let index = self.kmer_to_index(kmer); + self.bounds[index] = Some(bounds); + } + + pub fn index_to_kmer(&self, mut index: usize) -> Vec { + let mut kmer = Vec::with_capacity(K as usize); + + for _ in 0..K { + let modulo = index % self.base; + if modulo == 0 { + return kmer.iter().rev().cloned().collect(); + } + kmer.push(self.alphabet[modulo - 1]); + + index /= self.base; + } + + kmer.iter().rev().cloned().collect() + } + + fn kmer_to_index(&self, kmer: &[u8]) -> usize { + kmer + .iter() + .rev() + .enumerate() + .map(|(i, n)| self.ascii_array[*n as usize] * self.base.pow(i as u32)) + .sum() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bounds_cache() { + let kmer_cache = BoundsCache::<5>::new("ACDEFGHIKLMNPQRSTVWY".to_string()); + + assert_eq!(kmer_cache.kmer_to_index("A".as_bytes()), 1); + assert_eq!(kmer_cache.kmer_to_index("C".as_bytes()), 2); + assert_eq!(kmer_cache.kmer_to_index("AA".as_bytes()), 22); + assert_eq!(kmer_cache.kmer_to_index("DEMY".as_bytes()), 29798); + assert_eq!(kmer_cache.kmer_to_index("DEMYS".as_bytes()), 625774); + + // assert_eq!(kmer_cache.index_to_kmer(1), "A".as_bytes()); + // assert_eq!(kmer_cache.index_to_kmer(2), "C".as_bytes()); + // assert_eq!(kmer_cache.index_to_kmer(22), "AA".as_bytes()); + assert_eq!(kmer_cache.index_to_kmer(29798), "DEMY".as_bytes()); + assert_eq!(kmer_cache.index_to_kmer(625774), "DEMYS".as_bytes()); + } +} diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs index f276906..8c47f84 100644 --- a/sa-index/src/lib.rs +++ b/sa-index/src/lib.rs @@ -4,6 +4,7 @@ pub mod binary; pub mod peptide_search; pub mod sa_searcher; pub mod suffix_to_protein_index; +mod bounds_table; /// Represents a suffix array. pub enum SuffixArray { diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index d09c704..5f54987 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -1,5 +1,4 @@ use std::{cmp::min, ops::Deref}; - use sa_mappings::proteins::{Protein, Proteins}; use crate::{ @@ -7,6 +6,7 @@ use crate::{ suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex}, Nullable, SuffixArray }; +use crate::bounds_table::BoundsCache; /// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array #[derive(Clone, Copy, PartialEq)] @@ -121,6 +121,7 @@ impl Deref for DenseSearcher { /// the functional analysis provided by Unipept pub struct Searcher { pub sa: SuffixArray, + pub kmer_cache: BoundsCache<3>, pub proteins: Proteins, pub suffix_index_to_protein: Box } @@ -143,7 +144,30 @@ impl Searcher { /// /// Returns a new Searcher object pub fn new(sa: SuffixArray, proteins: Proteins, suffix_index_to_protein: Box) -> Self { - Self { sa, proteins, suffix_index_to_protein } + // Create a KTable with all possible 3-mers + let mut kmer_cache = BoundsCache::new("ACDEFGHIKLMNPQRSTVWY".to_string()); + + // Create the Searcher object + let mut searcher = Self { sa, kmer_cache, proteins, suffix_index_to_protein }; + + // Update the bounds for all 3-mers in the KTable + for i in 0..21_usize.pow(3) { + let kmer = searcher.kmer_cache.index_to_kmer(i); + + if kmer.is_empty() || searcher.kmer_cache.get_kmer(&kmer).is_some() { + continue; + } + + // Calculate stricter starting bounds for the 3-mers + let bounds = searcher.search_bounds(&kmer); + + if let BoundSearchResult::SearchResult((min_bound, max_bound)) = bounds { + let min_bound = if min_bound == 0 { 0 } else { min_bound - 1 }; + searcher.kmer_cache.update_kmer(&kmer, (min_bound, max_bound)); + } + } + + searcher } /// Compares the `search_string` to the `suffix` @@ -225,8 +249,7 @@ impl Searcher { /// The second argument indicates the index of the minimum or maximum bound for the match /// (depending on `bound`) fn binary_search_bound(&self, bound: BoundSearch, search_string: &[u8]) -> (bool, usize) { - let mut left: usize = 0; - let mut right: usize = self.sa.len(); + let (mut left, mut right) = self.kmer_cache.get_kmer(search_string).unwrap_or((0, self.sa.len())); let mut lcp_left: usize = 0; let mut lcp_right: usize = 0; let mut found = false;