Skip to content

Commit

Permalink
Merge branch 'feature/sa-bounds-optimization' into feature/split-fa-s…
Browse files Browse the repository at this point in the history
…ummary
  • Loading branch information
tibvdm committed Aug 28, 2024
2 parents 8067226 + 352bcb7 commit 0bdd1f2
Show file tree
Hide file tree
Showing 3 changed files with 209 additions and 26 deletions.
125 changes: 125 additions & 0 deletions sa-index/src/bounds_cache.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
pub struct BoundsCache {
pub bounds: Vec<Option<(usize, usize)>>,
pub base: usize,
pub k: usize,

ascii_array: [usize; 128],
powers_array: [usize; 10],
offsets_array: [usize; 10],
alphabet: Vec<u8>
}

impl BoundsCache {
pub fn new(alphabet: String, k: usize) -> BoundsCache {
assert!(k < 10, "K must be less than 10");

let alphabet = alphabet.to_uppercase().as_bytes().to_vec();
let base = alphabet.len();

let mut ascii_array: [usize; 128] = [0; 128];
for (i, byte) in alphabet.iter().enumerate() {
ascii_array[*byte as usize] = i;
}
//ascii_array[b'L' as usize] = ascii_array[b'I' as usize]; // I and L are treated as the same amino acid

let mut powers_array = [0; 10];
for i in 0..10 {
powers_array[i] = base.pow(i as u32);
}

let mut offsets_array = [0; 10];
for i in 2..10 {
offsets_array[i] = offsets_array[i - 1] + powers_array[i - 1];
}

// 20^1 + 20^2 + 20^3 + ... + 20^(K) = (20^(K + 1) - 20) / 19
let capacity = (base.pow(k as u32 + 1) - base) / (base - 1);

Self {
bounds: vec![None; capacity],
ascii_array,
powers_array,
offsets_array,
alphabet,
base,
k
}
}

pub fn get_kmer(&self, kmer: &[u8]) -> Option<(usize, usize)> {
self.bounds.get(self.kmer_to_index(kmer)).cloned()?
}

pub fn update_kmer(&mut self, kmer: &[u8], bounds: (usize, usize)) {
let index = self.kmer_to_index(kmer);
self.bounds[index] = Some(bounds);
}

pub fn index_to_kmer(&self, mut index: usize) -> Vec<u8> {
if index < self.base {
return vec![self.alphabet[index]];
}

// Calculate the length of the kmer
let mut length = 2;
while self.offsets_array[length + 1] <= index {
length += 1;
}

// Calculate the offset of the kmer
let offset = self.offsets_array[length];

// Translate the index to be inside the bounds [0, 20^k)
index -= offset;

// Basic conversion from base 10 to base `length`
let mut kmer = vec![0; length];
for i in 0..length {
kmer[length - i - 1] = self.alphabet[index % self.base];
index /= self.base;
}

kmer
}

fn kmer_to_index(&self, kmer: &[u8]) -> usize {
if kmer.len() == 1 {
return self.ascii_array[kmer[0] as usize];
}

let mut result = 0;
for i in 0..kmer.len() {
result += (self.ascii_array[kmer[i] as usize] + 1) * self.powers_array[kmer.len() - i - 1];
}

result - 1
}
}

#[cfg(test)]
mod tests {
use std::str::from_utf8;
use super::*;

#[test]
fn test_bounds_cache() {
let kmer_cache = BoundsCache::new("ACDEFGHIKLMNPQRSTVWY".to_string(), 5);

for i in 0..40 {
let kmer = kmer_cache.index_to_kmer(i);
eprintln!("{} -> {:?} -> {:?}", i, from_utf8(&kmer).unwrap(), kmer_cache.kmer_to_index(&kmer));
}

for i in 0..20_usize.pow(5) {
let kmer = kmer_cache.index_to_kmer(i);

if kmer.contains(&b'L') {
let equated_kmer = kmer.iter().map(|&c| if c == b'L' { b'I' } else { c }).collect::<Vec<u8>>();
assert_eq!(kmer_cache.kmer_to_index(&kmer), kmer_cache.kmer_to_index(&equated_kmer));
continue;
}

assert_eq!(kmer_cache.kmer_to_index(&kmer), i);
}
}
}
1 change: 1 addition & 0 deletions sa-index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ pub mod binary;
pub mod peptide_search;
pub mod sa_searcher;
pub mod suffix_to_protein_index;
mod bounds_cache;

/// Represents a suffix array.
pub enum SuffixArray {
Expand Down
109 changes: 83 additions & 26 deletions sa-index/src/sa_searcher.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
use std::{cmp::min, ops::Deref};

use std::{cmp::min, cmp::max, ops::Deref};
use std::time::{SystemTime, SystemTimeError, UNIX_EPOCH};
use sa_mappings::proteins::{Protein, Proteins};

use crate::{
sa_searcher::BoundSearch::{Maximum, Minimum},
suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex},
Nullable, SuffixArray
};
use crate::bounds_cache::BoundsCache;

/// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array
#[derive(Clone, Copy, PartialEq)]
Expand Down Expand Up @@ -74,9 +75,9 @@ impl PartialEq for SearchAllSuffixesResult {
pub struct SparseSearcher(Searcher);

impl SparseSearcher {
pub fn new(sa: SuffixArray, proteins: Proteins) -> Self {
pub fn new(sa: SuffixArray, proteins: Proteins, k: usize) -> Self {
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), k);
Self(searcher)
}
}
Expand All @@ -92,9 +93,9 @@ impl Deref for SparseSearcher {
pub struct DenseSearcher(Searcher);

impl DenseSearcher {
pub fn new(sa: SuffixArray, proteins: Proteins) -> Self {
pub fn new(sa: SuffixArray, proteins: Proteins, k: usize) -> Self {
let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), k);
Self(searcher)
}
}
Expand All @@ -121,6 +122,7 @@ impl Deref for DenseSearcher {
/// the functional analysis provided by Unipept
pub struct Searcher {
pub sa: SuffixArray,
pub kmer_cache: BoundsCache,
pub proteins: Proteins,
pub suffix_index_to_protein: Box<dyn SuffixToProteinIndex>
}
Expand All @@ -142,8 +144,41 @@ impl Searcher {
/// # Returns
///
/// Returns a new Searcher object
pub fn new(sa: SuffixArray, proteins: Proteins, suffix_index_to_protein: Box<dyn SuffixToProteinIndex>) -> Self {
Self { sa, proteins, suffix_index_to_protein }
pub fn new(sa: SuffixArray, proteins: Proteins, suffix_index_to_protein: Box<dyn SuffixToProteinIndex>, k: usize) -> Self {
// Create a KTable with all possible 3-mers
let kmer_cache = BoundsCache::new("ACDEFGHIKLMNPQRSTVWY".to_string(), k);

// Create the Searcher object
let mut searcher = Self { sa, kmer_cache, proteins, suffix_index_to_protein };

// Update the bounds for all 3-mers in the KTable
let base = searcher.kmer_cache.base;
let length = (base.pow(k as u32 + 1) - base) / (base - 1);

let print_step_size = max(length / 100, searcher.kmer_cache.base);

eprintln!("Starting cache fill");
let start_cache_fill_time = get_time_ms().unwrap();

for i in 0..length {
if i % print_step_size == 0 {
eprintln!("Updating kmer cache: {}% ({} seconds since start)", i / print_step_size, (get_time_ms().unwrap() - start_cache_fill_time) / 1000.0);
}

let kmer = searcher.kmer_cache.index_to_kmer(i);

let bounds = searcher.search_bounds_no_cache(&kmer, (0, searcher.sa.len()), 0);

if let BoundSearchResult::SearchResult((min_bound, max_bound)) = bounds {
let min_bound = if min_bound == 0 { 0 } else { min_bound - 1 };
let max_bound = if max_bound == searcher.sa.len() { max_bound } else { max_bound + 1 };
searcher.kmer_cache.update_kmer(&kmer, (min_bound, max_bound));
}
}

eprintln!("Filled cache in {} seconds", (get_time_ms().unwrap() - start_cache_fill_time) / 1000.0);

searcher
}

/// Compares the `search_string` to the `suffix`
Expand Down Expand Up @@ -224,10 +259,9 @@ impl Searcher {
/// The first argument is true if a match was found
/// The second argument indicates the index of the minimum or maximum bound for the match
/// (depending on `bound`)
fn binary_search_bound(&self, bound: BoundSearch, search_string: &[u8]) -> (bool, usize) {
let mut left: usize = 0;
let mut right: usize = self.sa.len();
let mut lcp_left: usize = 0;
fn binary_search_bound(&self, bound: BoundSearch, search_string: &[u8], start_bounds: (usize, usize), lcp_left: usize) -> (bool, usize) {
let (mut left, mut right) = start_bounds;
let mut lcp_left: usize = lcp_left;
let mut lcp_right: usize = 0;
let mut found = false;

Expand Down Expand Up @@ -278,13 +312,31 @@ impl Searcher {
/// Returns the minimum and maximum bound of all matches in the suffix array, or `NoMatches` if
/// no matches were found
pub fn search_bounds(&self, search_string: &[u8]) -> BoundSearchResult {
let (found_min, min_bound) = self.binary_search_bound(Minimum, search_string);
// If the string is empty, we don't need to search as nothing can be matched
if search_string.is_empty() {
return BoundSearchResult::NoMatches;
}

// Do a quick lookup in the kmer cache
// Use the (up to) first 5 characters of the search string as the kmer
// If the kmer is found in the cache, use the bounds from the cache as start bounds
// to find the bounds of the entire string
let max_mer_length = min(self.kmer_cache.k, search_string.len());
if let Some(bounds) = self.kmer_cache.get_kmer(&search_string[..max_mer_length]) {
return self.search_bounds_no_cache(search_string, bounds, max_mer_length);
}

BoundSearchResult::NoMatches
}

pub fn search_bounds_no_cache(&self, search_string: &[u8], start_bounds: (usize, usize), lcp_left: usize) -> BoundSearchResult {
let (found_min, min_bound) = self.binary_search_bound(Minimum, search_string, start_bounds, lcp_left);

if !found_min {
return BoundSearchResult::NoMatches;
}

let (_, max_bound) = self.binary_search_bound(Maximum, search_string);
let (_, max_bound) = self.binary_search_bound(Maximum, search_string, start_bounds, lcp_left);

BoundSearchResult::SearchResult((min_bound, max_bound + 1))
}
Expand Down Expand Up @@ -325,7 +377,7 @@ impl Searcher {
let il_locations_current_suffix = &il_locations[il_locations_start..];
let current_search_string_prefix = &search_string[..skip];
let current_search_string_suffix = &search_string[skip..];
let search_bound_result = self.search_bounds(&search_string[skip..]);
let search_bound_result = self.search_bounds(current_search_string_suffix);
// if the shorter part is matched, see if what goes before the matched suffix matches
// the unmatched part of the prefix
if let BoundSearchResult::SearchResult((min_bound, max_bound)) = search_bound_result {
Expand Down Expand Up @@ -456,6 +508,10 @@ impl Searcher {
}
}

pub fn get_time_ms() -> Result<f64, SystemTimeError> {
Ok(SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos() as f64 * 1e-6)
}

#[cfg(test)]
mod tests {
use sa_mappings::proteins::{Protein, Proteins};
Expand Down Expand Up @@ -521,15 +577,16 @@ mod tests {
let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1);

let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), 3);

// search bounds 'A'
let bounds_res = searcher.search_bounds(&[b'A']);
assert_eq!(bounds_res, BoundSearchResult::SearchResult((4, 9)));

// search bounds '$'
let bounds_res = searcher.search_bounds(&[b'$']);
assert_eq!(bounds_res, BoundSearchResult::SearchResult((0, 1)));
// TODO: do we want to keep this??
// let bounds_res = searcher.search_bounds(&[b'$']);
// assert_eq!(bounds_res, BoundSearchResult::SearchResult((0, 1)));

// search bounds 'AC'
let bounds_res = searcher.search_bounds(&[b'A', b'C']);
Expand All @@ -542,7 +599,7 @@ mod tests {
let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3);

let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), 3);

// search suffix 'VAA'
let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false);
Expand All @@ -559,7 +616,7 @@ mod tests {
let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1);

let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), 3);

let bounds_res = searcher.search_bounds(&[b'I']);
assert_eq!(bounds_res, BoundSearchResult::SearchResult((13, 16)));
Expand All @@ -575,7 +632,7 @@ mod tests {
let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3);

let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), 3);

// search bounds 'RIZ' with equal I and L
let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true);
Expand All @@ -602,7 +659,7 @@ mod tests {

let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2);
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3);

// search bounds 'IM' with equal I and L
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true);
Expand All @@ -624,7 +681,7 @@ mod tests {

let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1);
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3);

let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![2, 3, 4, 5]));
Expand All @@ -645,7 +702,7 @@ mod tests {

let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1);
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3);

let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4]));
Expand All @@ -666,7 +723,7 @@ mod tests {

let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2);
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3);

// search all places where II is in the string IIIILL, but with a sparse SA
// this way we check if filtering the suffixes works as expected
Expand All @@ -689,7 +746,7 @@ mod tests {

let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1);
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein), 3);

// search bounds 'IM' with equal I and L
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
Expand Down

0 comments on commit 0bdd1f2

Please sign in to comment.