Skip to content

Commit

Permalink
use a fast-rank bitvector implementation to find the proteins
Browse files Browse the repository at this point in the history
  • Loading branch information
tibvdm committed Aug 30, 2024
1 parent ad9b885 commit 96e746e
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 0 deletions.
1 change: 1 addition & 0 deletions sa-index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ serde = { version = "1.0.197", features = ["derive"] }
sa-mappings = { path = "../sa-mappings" }
bitarray = { path = "../bitarray" }
serde_json = "1.0.116"
vers-vecs = "1.5.0"
19 changes: 19 additions & 0 deletions sa-index/src/sa_searcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use crate::{
Nullable, SuffixArray
};
use crate::bounds_cache::BoundsCache;
use crate::suffix_to_protein_index::RankSuffixToProtein;

/// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array
#[derive(Clone, Copy, PartialEq)]
Expand Down Expand Up @@ -108,6 +109,24 @@ impl Deref for DenseSearcher {
}
}

/// Searcher whose suffix-to-protein lookups go through a [`RankSuffixToProtein`] index.
///
/// This is a newtype around [`Searcher`]; it dereferences to the wrapped `Searcher`, so all
/// of the search functionality is reached through that inner value.
pub struct RankSearcher(Searcher);

impl RankSearcher {
    /// Creates a new RankSearcher
    ///
    /// # Arguments
    /// * `sa` - The suffix array, handed to `Searcher::new` unchanged
    /// * `proteins` - The proteins; their `input_string` is used to build the
    ///   `RankSuffixToProtein` index before they are handed to `Searcher::new`
    /// * `k` - Handed to `Searcher::new` unchanged
    ///
    /// # Returns
    ///
    /// Returns a new RankSearcher wrapping the constructed `Searcher`
    pub fn new(sa: SuffixArray, proteins: Proteins, k: usize) -> Self {
        // The index only borrows the input string, so it has to be built before
        // `proteins` is moved into the searcher.
        let rank_index = RankSuffixToProtein::new(&proteins.input_string);

        Self(Searcher::new(sa, proteins, Box::new(rank_index), k))
    }
}

impl Deref for RankSearcher {
type Target = Searcher;

fn deref(&self) -> &Self::Target {
&self.0
}
}

/// Struct that contains all the elements needed to search a peptide in the suffix array
/// This struct also contains all the functions used for search
///
Expand Down
43 changes: 43 additions & 0 deletions sa-index/src/suffix_to_protein_index.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use clap::ValueEnum;
use vers_vecs::{BitVec, RsVec};
use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER};

use crate::Nullable;
Expand Down Expand Up @@ -38,6 +39,10 @@ pub struct SparseSuffixToProtein {
mapping: Vec<i64>
}

/// Suffix-to-protein index backed by a rank-enabled bit vector (`RsVec`).
pub struct RankSuffixToProtein {
/// One bit per byte of the input text; a bit is set where the text holds a
/// `SEPARATION_CHARACTER` or a `TERMINATION_CHARACTER`.
mapping: RsVec
}

impl SuffixToProteinIndex for DenseSuffixToProtein {
fn suffix_to_protein(&self, suffix: i64) -> u32 {
self.mapping[suffix as usize]
Expand All @@ -57,6 +62,16 @@ impl SuffixToProteinIndex for SparseSuffixToProtein {
}
}

impl SuffixToProteinIndex for RankSuffixToProtein {
    /// Maps a suffix to the index of the protein it belongs to.
    ///
    /// # Arguments
    /// * `suffix` - The position in the text at which the suffix starts
    ///
    /// # Returns
    ///
    /// Returns `u32::NULL` when the bit at `suffix` is set, i.e. the suffix starts with a
    /// separation or termination character. Otherwise returns the rank of `suffix`, which is
    /// the number of set bits (protein boundaries) that precede it.
    fn suffix_to_protein(&self, suffix: i64) -> u32 {
        // NOTE(review): negative or out-of-range suffixes are not rejected here; what happens
        // then depends on `RsVec::get` / `RsVec::rank1` — confirm callers never pass them.
        let position = suffix as usize;

        match self.mapping.get(position) {
            // A set bit marks a boundary character, which belongs to no protein.
            Some(1) => u32::NULL,
            _ => self.mapping.rank1(position) as u32
        }
    }
}

impl DenseSuffixToProtein {
/// Creates a new DenseSuffixToProtein mapping
///
Expand Down Expand Up @@ -104,6 +119,21 @@ impl SparseSuffixToProtein {
}
}

impl RankSuffixToProtein {
    /// Creates a new RankSuffixToProtein mapping
    ///
    /// # Arguments
    /// * `text` - The text over which we want to create the mapping
    ///
    /// # Returns
    ///
    /// Returns a new RankSuffixToProtein build over the provided text
    pub fn new(text: &[u8]) -> Self {
        let mut boundaries = BitVec::from_zeros(text.len());

        // Positions that hold a character which ends a protein.
        let boundary_positions = text
            .iter()
            .enumerate()
            .filter(|&(_, &byte)| byte == SEPARATION_CHARACTER || byte == TERMINATION_CHARACTER)
            .map(|(position, _)| position);

        for position in boundary_positions {
            // `position` comes from enumerating `text`, and the bit vector was created with
            // `text.len()` bits, so it is always a valid index.
            boundaries.set(position, 1).unwrap();
        }

        Self {
            mapping: RsVec::from_bit_vec(boundaries)
        }
    }
}

#[cfg(test)]
mod tests {
use clap::ValueEnum;
Expand All @@ -115,6 +145,7 @@ mod tests {
},
Nullable
};
use crate::suffix_to_protein_index::RankSuffixToProtein;

fn build_text() -> Vec<u8> {
let mut text = ["ACG", "CG", "AAA"].join(&format!("{}", SEPARATION_CHARACTER as char));
Expand Down Expand Up @@ -172,4 +203,16 @@ mod tests {
// suffix that starts with TERMINATION_CHARACTER
assert_eq!(index.suffix_to_protein(10), u32::NULL);
}

#[test]
fn test_search_rank() {
    let text = build_text();
    let index = RankSuffixToProtein::new(&text);

    // (suffix, expected protein)
    let expectations = [
        (5, 1),
        (7, 2),
        // suffix that starts with SEPARATION_CHARACTER
        (3, u32::NULL),
        // suffix that starts with TERMINATION_CHARACTER
        (10, u32::NULL)
    ];

    for (suffix, expected) in expectations {
        assert_eq!(index.suffix_to_protein(suffix), expected);
    }
}
}

0 comments on commit 96e746e

Please sign in to comment.