Skip to content

Commit

Permalink
first bad implementation for a kmer lookup cache
Browse files Browse the repository at this point in the history
  • Loading branch information
tibvdm committed Aug 26, 2024
1 parent f15e3f8 commit b95b9f9
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 4 deletions.
88 changes: 88 additions & 0 deletions sa-index/src/bounds_table.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
pub struct BoundsCache<const K: u32> {
pub bounds: Vec<Option<(usize, usize)>>,

ascii_array: [usize; 128],
alphabet: Vec<u8>,
base: usize
}

impl<const K: u32> BoundsCache<K> {
pub fn new(alphabet: String) -> BoundsCache<K> {
let alphabet = alphabet.to_uppercase().as_bytes().to_vec();
let base = alphabet.len() + 1;

let mut ascii_array: [usize; 128] = [0; 128];
for (i, byte) in alphabet.iter().enumerate() {
// Add 1 to the index, so we can reserve a different value for the 0 index
ascii_array[*byte as usize] = i + 1;
}

BoundsCache {
bounds: vec![None; base.pow(K)],
ascii_array,
alphabet,
base
}
}

pub fn get_kmer(&self, kmer: &[u8]) -> Option<(usize, usize)> {
self.bounds.get(self.kmer_to_index(kmer)).cloned()?
}

pub fn update_all_kmers(&mut self, kmer: &[u8], bounds: (usize, usize)) {
let index = self.kmer_to_index(kmer);
self.bounds[index] = Some(bounds);
}

pub fn update_kmer(&mut self, kmer: &[u8], bounds: (usize, usize)) {
let index = self.kmer_to_index(kmer);
self.bounds[index] = Some(bounds);
}

pub fn index_to_kmer(&self, mut index: usize) -> Vec<u8> {
let mut kmer = Vec::with_capacity(K as usize);

for _ in 0..K {
let modulo = index % self.base;
if modulo == 0 {
return kmer.iter().rev().cloned().collect();
}
kmer.push(self.alphabet[modulo - 1]);

index /= self.base;
}

kmer.iter().rev().cloned().collect()
}

fn kmer_to_index(&self, kmer: &[u8]) -> usize {
kmer
.iter()
.rev()
.enumerate()
.map(|(i, n)| self.ascii_array[*n as usize] * self.base.pow(i as u32))
.sum()
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_bounds_cache() {
let kmer_cache = BoundsCache::<5>::new("ACDEFGHIKLMNPQRSTVWY".to_string());

assert_eq!(kmer_cache.kmer_to_index("A".as_bytes()), 1);
assert_eq!(kmer_cache.kmer_to_index("C".as_bytes()), 2);
assert_eq!(kmer_cache.kmer_to_index("AA".as_bytes()), 22);
assert_eq!(kmer_cache.kmer_to_index("DEMY".as_bytes()), 29798);
assert_eq!(kmer_cache.kmer_to_index("DEMYS".as_bytes()), 625774);

// assert_eq!(kmer_cache.index_to_kmer(1), "A".as_bytes());
// assert_eq!(kmer_cache.index_to_kmer(2), "C".as_bytes());
// assert_eq!(kmer_cache.index_to_kmer(22), "AA".as_bytes());
assert_eq!(kmer_cache.index_to_kmer(29798), "DEMY".as_bytes());
assert_eq!(kmer_cache.index_to_kmer(625774), "DEMYS".as_bytes());
}
}
1 change: 1 addition & 0 deletions sa-index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ pub mod binary;
pub mod peptide_search;
pub mod sa_searcher;
pub mod suffix_to_protein_index;
mod bounds_table;

/// Represents a suffix array.
pub enum SuffixArray {
Expand Down
31 changes: 27 additions & 4 deletions sa-index/src/sa_searcher.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
use std::{cmp::min, ops::Deref};

use sa_mappings::proteins::{Protein, Proteins};

use crate::{
sa_searcher::BoundSearch::{Maximum, Minimum},
suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex},
Nullable, SuffixArray
};
use crate::bounds_table::BoundsCache;

/// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array
#[derive(Clone, Copy, PartialEq)]
Expand Down Expand Up @@ -121,6 +121,7 @@ impl Deref for DenseSearcher {
/// the functional analysis provided by Unipept
pub struct Searcher {
pub sa: SuffixArray,
pub kmer_cache: BoundsCache<3>,
pub proteins: Proteins,
pub suffix_index_to_protein: Box<dyn SuffixToProteinIndex>
}
Expand All @@ -143,7 +144,30 @@ impl Searcher {
///
/// Returns a new Searcher object
pub fn new(sa: SuffixArray, proteins: Proteins, suffix_index_to_protein: Box<dyn SuffixToProteinIndex>) -> Self {
Self { sa, proteins, suffix_index_to_protein }
// Create a KTable with all possible 3-mers
let mut kmer_cache = BoundsCache::new("ACDEFGHIKLMNPQRSTVWY".to_string());

Check warning on line 148 in sa-index/src/sa_searcher.rs

View workflow job for this annotation

GitHub Actions / Check + test

variable does not need to be mutable

Check warning on line 148 in sa-index/src/sa_searcher.rs

View workflow job for this annotation

GitHub Actions / Check + test

variable does not need to be mutable

// Create the Searcher object
let mut searcher = Self { sa, kmer_cache, proteins, suffix_index_to_protein };

// Update the bounds for all 3-mers in the KTable
for i in 0..21_usize.pow(3) {
let kmer = searcher.kmer_cache.index_to_kmer(i);

if kmer.is_empty() || searcher.kmer_cache.get_kmer(&kmer).is_some() {
continue;
}

// Calculate stricter starting bounds for the 3-mers
let bounds = searcher.search_bounds(&kmer);

if let BoundSearchResult::SearchResult((min_bound, max_bound)) = bounds {
let min_bound = if min_bound == 0 { 0 } else { min_bound - 1 };
searcher.kmer_cache.update_kmer(&kmer, (min_bound, max_bound));
}
}

searcher
}

/// Compares the `search_string` to the `suffix`
Expand Down Expand Up @@ -225,8 +249,7 @@ impl Searcher {
/// The second argument indicates the index of the minimum or maximum bound for the match
/// (depending on `bound`)
fn binary_search_bound(&self, bound: BoundSearch, search_string: &[u8]) -> (bool, usize) {
let mut left: usize = 0;
let mut right: usize = self.sa.len();
let (mut left, mut right) = self.kmer_cache.get_kmer(search_string).unwrap_or((0, self.sa.len()));
let mut lcp_left: usize = 0;
let mut lcp_right: usize = 0;
let mut found = false;
Expand Down

0 comments on commit b95b9f9

Please sign in to comment.