From 96e746e348d4ec38e6d3bfb65d04b5a3fa696e6c Mon Sep 17 00:00:00 2001
From: tibvdm
Date: Fri, 30 Aug 2024 15:01:05 +0200
Subject: [PATCH] use a fast-rank bitvector implementation to find the proteins

---
 sa-index/Cargo.toml                     |  1 +
 sa-index/src/sa_searcher.rs             | 19 +++++++++++
 sa-index/src/suffix_to_protein_index.rs | 43 +++++++++++++++++++++++++
 3 files changed, 63 insertions(+)

diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml
index de57fc9..cf2d524 100644
--- a/sa-index/Cargo.toml
+++ b/sa-index/Cargo.toml
@@ -16,3 +16,4 @@ serde = { version = "1.0.197", features = ["derive"] }
 sa-mappings = { path = "../sa-mappings" }
 bitarray = { path = "../bitarray" }
 serde_json = "1.0.116"
+vers-vecs = "1.5.0"
diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs
index e00d216..757c7b8 100644
--- a/sa-index/src/sa_searcher.rs
+++ b/sa-index/src/sa_searcher.rs
@@ -8,6 +8,7 @@ use crate::{
     Nullable, SuffixArray
 };
 use crate::bounds_cache::BoundsCache;
+use crate::suffix_to_protein_index::RankSuffixToProtein;
 
 /// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array
 #[derive(Clone, Copy, PartialEq)]
@@ -108,6 +109,24 @@ impl Deref for DenseSearcher {
     }
 }
 
+pub struct RankSearcher(Searcher);
+
+impl RankSearcher {
+    pub fn new(sa: SuffixArray, proteins: Proteins, k: usize) -> Self {
+        let suffix_index_to_protein = RankSuffixToProtein::new(&proteins.input_string);
+        let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein), k);
+        Self(searcher)
+    }
+}
+
+impl Deref for RankSearcher {
+    type Target = Searcher;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
 /// Struct that contains all the elements needed to search a peptide in the suffix array
 /// This struct also contains all the functions used for search
 ///
diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs
index 121b569..743fc24 100644
--- a/sa-index/src/suffix_to_protein_index.rs
+++ b/sa-index/src/suffix_to_protein_index.rs
@@ -1,4 +1,5 @@
 use clap::ValueEnum;
+use vers_vecs::{BitVec, RsVec};
 use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER};
 
 use crate::Nullable;
@@ -38,6 +39,10 @@ pub struct SparseSuffixToProtein {
     mapping: Vec<i64>
 }
 
+pub struct RankSuffixToProtein {
+    mapping: RsVec
+}
+
 impl SuffixToProteinIndex for DenseSuffixToProtein {
     fn suffix_to_protein(&self, suffix: i64) -> u32 {
         self.mapping[suffix as usize]
@@ -57,6 +62,16 @@ impl SuffixToProteinIndex for SparseSuffixToProtein {
     }
 }
 
+impl SuffixToProteinIndex for RankSuffixToProtein {
+    fn suffix_to_protein(&self, suffix: i64) -> u32 {
+        if let Some(1) = self.mapping.get(suffix as usize) {
+            return u32::NULL;
+        }
+
+        self.mapping.rank1(suffix as usize) as u32
+    }
+}
+
 impl DenseSuffixToProtein {
     /// Creates a new DenseSuffixToProtein mapping
     ///
@@ -104,6 +119,21 @@ impl SparseSuffixToProtein {
     }
 }
 
+impl RankSuffixToProtein {
+    pub fn new(text: &[u8]) -> Self {
+        let mut bit_vector = BitVec::from_zeros(text.len());
+        for (index, &char) in text.iter().enumerate() {
+            if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER {
+                bit_vector.set(index, 1).unwrap();
+            }
+        }
+
+        Self {
+            mapping: RsVec::from_bit_vec(bit_vector)
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use clap::ValueEnum;
@@ -115,6 +145,7 @@ mod tests {
         },
         Nullable
     };
+    use crate::suffix_to_protein_index::RankSuffixToProtein;
 
     fn build_text() -> Vec<u8> {
         let mut text = ["ACG", "CG", "AAA"].join(&format!("{}", SEPARATION_CHARACTER as char));
@@ -172,4 +203,16 @@ mod tests {
         // suffix that starts with TERMINATION_CHARACTER
         assert_eq!(index.suffix_to_protein(10), u32::NULL);
     }
+
+    #[test]
+    fn test_search_rank() {
+        let u8_text = &build_text();
+        let index = RankSuffixToProtein::new(u8_text);
+        assert_eq!(index.suffix_to_protein(5), 1);
+        assert_eq!(index.suffix_to_protein(7), 2);
+        // suffix that starts with SEPARATION_CHARACTER
+        assert_eq!(index.suffix_to_protein(3), u32::NULL);
+        // suffix that starts with TERMINATION_CHARACTER
+        assert_eq!(index.suffix_to_protein(10), u32::NULL);
+    }
 }