diff --git a/Cargo.toml b/Cargo.toml index fec447c..c53d905 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,8 @@ resolver = "2" members = [ "bitarray", "fa-compression", "libsais64-rs", - "sa-builder", "sa-compression", + "sa-builder", + "sa-compression", "sa-index", "sa-mappings", "sa-server" diff --git a/fa-compression/benches/algorithm2/decode.rs b/fa-compression/benches/algorithm2/decode.rs index 4d562fc..bc14527 100644 --- a/fa-compression/benches/algorithm2/decode.rs +++ b/fa-compression/benches/algorithm2/decode.rs @@ -1,5 +1,5 @@ use criterion::black_box; -use fa_compression::algorithm2::{decode, encode, CompressionTable}; +use fa_compression::algorithm2::{CompressionTable, decode, encode}; use super::util::generate_annotation; diff --git a/fa-compression/benches/algorithm2/encode.rs b/fa-compression/benches/algorithm2/encode.rs index 827dd50..406487f 100644 --- a/fa-compression/benches/algorithm2/encode.rs +++ b/fa-compression/benches/algorithm2/encode.rs @@ -1,5 +1,5 @@ use criterion::black_box; -use fa_compression::algorithm2::{encode, CompressionTable}; +use fa_compression::algorithm2::{CompressionTable, encode}; use super::util::generate_annotation; diff --git a/fa-compression/benches/util.rs b/fa-compression/benches/util.rs index b6ddd9a..fee2359 100644 --- a/fa-compression/benches/util.rs +++ b/fa-compression/benches/util.rs @@ -1,4 +1,4 @@ -use rand::{rngs::ThreadRng, Rng}; +use rand::{Rng, rngs::ThreadRng}; /// Generate a random InterPro annotation. pub fn generate_ipr(random: &mut ThreadRng) -> String { diff --git a/libsais64-rs/builder.rs b/libsais64-rs/builder.rs index 5b3feb2..3d52f03 100644 --- a/libsais64-rs/builder.rs +++ b/libsais64-rs/builder.rs @@ -53,7 +53,7 @@ fn main() -> Result<(), Box> { // clone the c library Command::new("git") - .args(["clone", "https://github.com/IlyaGrebnov/libsais.git", "--depth=1"]) + .args(["clone", "https://github.com/unipept/unipept-libsais.git", "libsais", "--depth=1"]) .status() .expect("Failed to clone the libsais repository"); diff --git a/libsais64-rs/libsais-wrapper.h b/libsais64-rs/libsais-wrapper.h index fbfe0b9..aa0a21d 100644 --- a/libsais64-rs/libsais-wrapper.h +++ b/libsais64-rs/libsais-wrapper.h @@ -1,4 +1,8 @@ -#include "libsais/include/libsais64.h" +#include "libsais/include/libsais16x64.h" +int64_t libsais16x64(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); + +int64_t libsais32x64(const uint32_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs, int64_t * freq); + int64_t libsais64(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); \ No newline at end of file diff --git a/libsais64-rs/src/bitpacking.rs b/libsais64-rs/src/bitpacking.rs new file mode 100644 index 0000000..ba601a6 --- /dev/null +++ b/libsais64-rs/src/bitpacking.rs @@ -0,0 +1,107 @@ +// Function to get the rank of a character +fn get_rank(c: u8) -> u8 { + match c { + b'$' => 0, + b'-' => 1, + _ => 2 + (c - b'A') + } +} + +// Amount of bits necessary to represent one character in the protein text. +pub const BITS_PER_CHAR: usize = 5; + +// Bitpack text in a vector of u8 elements. BITS_PER_CHAR * sparseness_factor <= 8. +pub fn bitpack_text_8(text: Vec, sparseness_factor: usize) -> Vec { + assert!(BITS_PER_CHAR * sparseness_factor <= 8); + + let num_ints = (text.len() + (sparseness_factor - 1)) / sparseness_factor; + let mut text_packed = vec![0; num_ints]; + + if text.is_empty() { + return text_packed; + } + + for (i, element) in text_packed.iter_mut().enumerate().take(num_ints - 1) { + let ti = i * sparseness_factor; + *element = 0u8; + for j in 0..sparseness_factor { + let rank_c = get_rank(text[ti + j]); + *element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)); + } + } + + // Handle the last element + let mut last_element = 0u8; + let last_el_start = sparseness_factor * (num_ints - 1); + for i in 0..((text.len() - 1) % sparseness_factor + 1) { + let rank_c = get_rank(text[last_el_start + i]); + last_element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - i)); + } + text_packed[num_ints - 1] = last_element; + + text_packed +} + +// Bitpack text in a vector of u16 elements. BITS_PER_CHAR * sparseness_factor <= 16. +pub fn bitpack_text_16(text: Vec, sparseness_factor: usize) -> Vec { + assert!(BITS_PER_CHAR * sparseness_factor <= 16); + + let num_ints = (text.len() + (sparseness_factor - 1)) / sparseness_factor; + let mut text_packed = vec![0; num_ints]; + + if text.is_empty() { + return text_packed; + } + + for (i, element) in text_packed.iter_mut().enumerate().take(num_ints - 1) { + let ti = i * sparseness_factor; + *element = 0u16; + for j in 0..sparseness_factor { + let rank_c = get_rank(text[ti + j]) as u16; + *element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)); + } + } + + // Handle the last element + let mut last_element = 0u16; + let last_el_start = sparseness_factor * (num_ints - 1); + for i in 0..((text.len() - 1) % sparseness_factor + 1) { + let rank_c = get_rank(text[last_el_start + i]) as u16; + last_element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - i)); + } + text_packed[num_ints - 1] = last_element; + + text_packed +} + +// Bitpack text in a vector of u32 elements. BITS_PER_CHAR * sparseness_factor <= 32. +pub fn bitpack_text_32(text: Vec, sparseness_factor: usize) -> Vec { + assert!(BITS_PER_CHAR * sparseness_factor <= 32); + + let num_ints = (text.len() + (sparseness_factor - 1)) / sparseness_factor; + let mut text_packed = vec![0; num_ints]; + + if text.is_empty() { + return text_packed; + } + + for (i, element) in text_packed.iter_mut().enumerate().take(num_ints - 1) { + let ti = i * sparseness_factor; + *element = 0u32; + for j in 0..sparseness_factor { + let rank_c = get_rank(text[ti + j]) as u32; + *element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)); + } + } + + // Handle the last element + let mut last_element = 0u32; + let last_el_start = sparseness_factor * (num_ints - 1); + for i in 0..((text.len() - 1) % sparseness_factor + 1) { + let rank_c = get_rank(text[last_el_start + i]) as u32; + last_element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - i)); + } + text_packed[num_ints - 1] = last_element; + + text_packed +} diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index e2a87f6..8c9011c 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -2,9 +2,14 @@ #![allow(non_upper_case_globals)] #![allow(non_camel_case_types)] #![allow(non_snake_case)] +use std::ptr::null_mut; + +use crate::bitpacking::{BITS_PER_CHAR, bitpack_text_8, bitpack_text_16, bitpack_text_32}; include!(concat!(env!("OUT_DIR"), "/bindings.rs")); -/// Builds the suffix array over the `text` using the libsais64 algorithm +pub mod bitpacking; + +/// Builds the suffix array over the `text` using the libsais algorithm /// /// # Arguments /// * `text` - The text used for suffix array construction @@ -13,10 +18,41 @@ include!(concat!(env!("OUT_DIR"), "/bindings.rs")); /// /// Returns Some with the suffix array build over the text if construction succeeds /// Returns None if construction of the suffix array failed -pub fn sais64(text: &[u8]) -> Option> { - let mut sa = vec![0; text.len()]; - let exit_code = unsafe { libsais64(text.as_ptr(), sa.as_mut_ptr(), text.len() as i64, 0, std::ptr::null_mut()) }; - if exit_code == 0 { Some(sa) } else { None } +pub fn sais64(text: Vec, libsais_sparseness: usize) -> Result, &'static str> { + let exit_code; + let mut sa; + + let required_bits = libsais_sparseness * BITS_PER_CHAR; + if required_bits <= 8 { + // bitpacked values fit in uint8_t + let packed_text = if libsais_sparseness == 1 { text } else { bitpack_text_8(text, libsais_sparseness) }; + + sa = vec![0; packed_text.len()]; + exit_code = + unsafe { libsais64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) }; + } else if required_bits <= 16 { + // bitpacked values fit in uint16_t + let packed_text = bitpack_text_16(text, libsais_sparseness); + sa = vec![0; packed_text.len()]; + exit_code = + unsafe { libsais16x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) }; + } else { + let packed_text = bitpack_text_32(text, libsais_sparseness); + sa = vec![0; packed_text.len()]; + let k = 1 << (libsais_sparseness * BITS_PER_CHAR); + exit_code = + unsafe { libsais32x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, k, 0, null_mut()) }; + } + + if exit_code == 0 { + for elem in sa.iter_mut() { + let libsais_sparseness = libsais_sparseness as i64; + *elem *= libsais_sparseness; + } + Ok(sa) + } else { + Err("Failed building suffix array") + } } #[cfg(test)] @@ -25,8 +61,10 @@ mod tests { #[test] fn check_build_sa_with_libsais64() { - let text = "banana$"; - let sa = sais64(text.as_bytes()); - assert_eq!(sa, Some(vec![6, 5, 3, 1, 0, 4, 2])); + let sparseness_factor = 4; + let text = "BANANA-BANANA$".as_bytes().to_vec(); + let sa = sais64(text, sparseness_factor); + let correct_sa: Vec = vec![12, 8, 0, 4]; + assert_eq!(sa, Ok(correct_sa)); } } diff --git a/sa-builder/README.md b/sa-builder/README.md index 5f51b07..870ddb6 100644 --- a/sa-builder/README.md +++ b/sa-builder/README.md @@ -18,7 +18,7 @@ Options: -o, --output Output location where to store the suffix array -s, --sparseness-factor - The sparseness_factor used on the suffix array (default value 1, which means every value in the SA is used) [default: 1] + The sparseness_factor used on the suffix array (default value 1, which means every value in the SA is used). Internally, a library call will be performed with a maximum sparseness of 5 (because of memory usage). If a higher sparsity is desired, the largest divisor smaller than or equal to 5 is used for the library call. Then, the SA is filtered to achieve the desired sparsity. [default: 1] -a, --construction-algorithm The algorithm used to construct the suffix array (default value LibSais) [default: lib-sais] [possible values: lib-div-suf-sort, lib-sais] -c, --compress-sa diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index c0e13cd..6294a28 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -46,22 +46,50 @@ pub enum SAConstructionAlgorithm { /// /// The errors that occurred during the building of the suffix array itself pub fn build_ssa( - text: &mut Vec, + mut text: Vec, construction_algorithm: &SAConstructionAlgorithm, sparseness_factor: u8 ) -> Result, Box> { // translate all L's to a I - translate_l_to_i(text); + translate_l_to_i(&mut text); // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { - SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(text), - SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text) - } - .ok_or("Building suffix array failed")?; + SAConstructionAlgorithm::LibSais => libsais64(text, sparseness_factor)?, + SAConstructionAlgorithm::LibDivSufSort => { + libdivsufsort_rs::divsufsort64(&text).ok_or("Building suffix array failed")? + } + }; // make the SA sparse and decrease the vector size if we have sampling (sampling_rate > 1) - sample_sa(&mut sa, sparseness_factor); + if *construction_algorithm == SAConstructionAlgorithm::LibDivSufSort { + sample_sa(&mut sa, sparseness_factor); + } + + Ok(sa) +} + +// Max sparseness for libsais because it creates a bucket for each element of the alphabet (2 ^ (sparseness * bits_per_char) buckets). +const MAX_SPARSENESS: usize = 5; +fn libsais64(text: Vec, sparseness_factor: u8) -> Result, &'static str> { + let sparseness_factor = sparseness_factor as usize; + + // set libsais_sparseness to highest sparseness factor fitting in 32-bit value and sparseness factor divisible by libsais sparseness + // max 28 out of 32 bits used, because a bucket is created for every element of the alfabet 8 * 2^28). + let mut libsais_sparseness = MAX_SPARSENESS; + while sparseness_factor % libsais_sparseness != 0 { + libsais_sparseness -= 1; + } + let sample_rate = sparseness_factor / libsais_sparseness; + eprintln!("\tSparseness factor: {}", sparseness_factor); + eprintln!("\tLibsais sparseness factor: {}", libsais_sparseness); + eprintln!("\tSample rate: {}", sample_rate); + + let mut sa = libsais64_rs::sais64(text, libsais_sparseness)?; + + if sample_rate > 1 { + sample_sa(&mut sa, sample_rate as u8); + } Ok(sa) } @@ -147,42 +175,42 @@ mod tests { #[test] fn test_build_ssa_libsais() { let mut text = b"ABRACADABRA$".to_vec(); - let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 1).unwrap(); + let sa = build_ssa(text, &SAConstructionAlgorithm::LibSais, 1).unwrap(); assert_eq!(sa, vec![11, 10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]); } #[test] fn test_build_ssa_libsais_empty() { let mut text = b"".to_vec(); - let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 1).unwrap(); + let sa = build_ssa(text, &SAConstructionAlgorithm::LibSais, 1).unwrap(); assert_eq!(sa, vec![]); } #[test] fn test_build_ssa_libsais_sparse() { let mut text = b"ABRACADABRA$".to_vec(); - let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 2).unwrap(); + let sa = build_ssa(text, &SAConstructionAlgorithm::LibSais, 2).unwrap(); assert_eq!(sa, vec![10, 0, 8, 4, 6, 2]); } #[test] fn test_build_ssa_libdivsufsort() { let mut text = b"ABRACADABRA$".to_vec(); - let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap(); + let sa = build_ssa(text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap(); assert_eq!(sa, vec![11, 10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]); } #[test] fn test_build_ssa_libdivsufsort_empty() { let mut text = b"".to_vec(); - let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap(); + let sa = build_ssa(text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap(); assert_eq!(sa, vec![]); } #[test] fn test_build_ssa_libdivsufsort_sparse() { let mut text = b"ABRACADABRA$".to_vec(); - let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 2).unwrap(); + let sa = build_ssa(text, &SAConstructionAlgorithm::LibDivSufSort, 2).unwrap(); assert_eq!(sa, vec![10, 0, 8, 4, 6, 2]); } diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index 01cc3c4..5ec0483 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -5,7 +5,7 @@ use std::{ }; use clap::Parser; -use sa_builder::{build_ssa, Arguments}; +use sa_builder::{Arguments, build_ssa}; use sa_compression::dump_compressed_suffix_array; use sa_index::binary::dump_suffix_array; use sa_mappings::proteins::Proteins; @@ -21,8 +21,9 @@ fn main() { eprintln!(); eprintln!("📋 Started loading the proteins..."); let start_proteins_time = get_time_ms().unwrap(); - let mut data = Proteins::try_from_database_file_uncompressed(&database_file) + let data = Proteins::try_from_database_file_uncompressed(&database_file) .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); + let bits_per_value = (data.len() as f64).log2().ceil() as usize; eprintln!( "✅ Successfully loaded the proteins in {} seconds!", (get_time_ms().unwrap() - start_proteins_time) / 1000.0 @@ -31,14 +32,13 @@ fn main() { eprintln!(); eprintln!("📋 Started building the suffix array..."); let start_ssa_time = get_time_ms().unwrap(); - let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor) + let sa = build_ssa(data, &construction_algorithm, sparseness_factor) .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); eprintln!( "✅ Successfully built the suffix array in {} seconds!", (get_time_ms().unwrap() - start_ssa_time) / 1000.0 ); eprintln!("\tAmount of items: {}", sa.len()); - eprintln!("\tSample rate: {}", sparseness_factor); // open the output file let mut file = @@ -49,8 +49,6 @@ fn main() { let start_dump_time = get_time_ms().unwrap(); if compress_sa { - let bits_per_value = (data.len() as f64).log2().ceil() as usize; - if let Err(err) = dump_compressed_suffix_array(sa, sparseness_factor, bits_per_value, &mut file) { eprint_and_exit(err.to_string().as_str()); }; diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs index e9952a2..8a107c0 100644 --- a/sa-compression/src/lib.rs +++ b/sa-compression/src/lib.rs @@ -3,7 +3,7 @@ use std::{ io::{BufRead, Write} }; -use bitarray::{data_to_writer, Binary, BitArray}; +use bitarray::{Binary, BitArray, data_to_writer}; use sa_index::SuffixArray; /// Writes the compressed suffix array to a writer. diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index dab8577..1b51dc3 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -4,9 +4,9 @@ use sa_mappings::proteins::{Protein, Proteins, SEPARATION_CHARACTER, TERMINATION use text_compression::ProteinTextSlice; use crate::{ + Nullable, SuffixArray, sa_searcher::BoundSearch::{Maximum, Minimum}, - suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex}, - Nullable, SuffixArray + suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex} }; /// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array @@ -495,9 +495,9 @@ mod tests { use text_compression::ProteinText; use crate::{ + SuffixArray, sa_searcher::{BoundSearchResult, SearchAllSuffixesResult, Searcher}, - suffix_to_protein_index::SparseSuffixToProtein, - SuffixArray + suffix_to_protein_index::SparseSuffixToProtein }; #[test] diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs index a6a4e93..6cbcd91 100644 --- a/sa-index/src/suffix_to_protein_index.rs +++ b/sa-index/src/suffix_to_protein_index.rs @@ -112,10 +112,10 @@ mod tests { use text_compression::ProteinText; use crate::{ + Nullable, suffix_to_protein_index::{ DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex, SuffixToProteinMappingStyle - }, - Nullable + } }; fn build_text() -> ProteinText { diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs index 1a1cedf..82e1aec 100644 --- a/sa-server/src/main.rs +++ b/sa-server/src/main.rs @@ -6,18 +6,18 @@ use std::{ }; use axum::{ + Json, Router, extract::{DefaultBodyLimit, State}, http::StatusCode, - routing::post, - Json, Router + routing::post }; use clap::Parser; use sa_compression::load_compressed_suffix_array; use sa_index::{ + SuffixArray, binary::load_suffix_array, - peptide_search::{search_all_peptides, SearchResult}, - sa_searcher::SparseSearcher, - SuffixArray + peptide_search::{SearchResult, search_all_peptides}, + sa_searcher::SparseSearcher }; use sa_mappings::proteins::Proteins; use serde::Deserialize; diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs index 338e234..4b79f03 100644 --- a/text-compression/src/lib.rs +++ b/text-compression/src/lib.rs @@ -4,7 +4,7 @@ use std::{ io::{BufRead, Write} }; -use bitarray::{data_to_writer, Binary, BitArray}; +use bitarray::{Binary, BitArray, data_to_writer}; /// Structure representing the proteins, stored in a bit array using 5 bits per amino acid. pub struct ProteinText {