From c94df8d5de4a1fcf4c234c38e175f20ac75af6f4 Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Mon, 14 Oct 2024 10:20:01 +0200 Subject: [PATCH 01/26] SA construction with bitpacking --- libsais64-rs/src/lib.rs | 6 +++--- sa-builder/src/bitpacking.rs | 39 ++++++++++++++++++++++++++++++++++++ sa-builder/src/lib.rs | 15 +++++++++++--- sa-builder/src/main.rs | 7 ++++--- 4 files changed, 58 insertions(+), 9 deletions(-) create mode 100644 sa-builder/src/bitpacking.rs diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index e2a87f6..c2c9369 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -13,15 +13,15 @@ include!(concat!(env!("OUT_DIR"), "/bindings.rs")); /// /// Returns Some with the suffix array build over the text if construction succeeds /// Returns None if construction of the suffix array failed -pub fn sais64(text: &[u8]) -> Option> { +pub fn sais64_long(text: &mut Vec, alphabet_size: i64) -> Option> { let mut sa = vec![0; text.len()]; - let exit_code = unsafe { libsais64(text.as_ptr(), sa.as_mut_ptr(), text.len() as i64, 0, std::ptr::null_mut()) }; + let exit_code = unsafe { libsais64_long(text.as_mut_ptr(), sa.as_mut_ptr(), text.len() as i64, alphabet_size, 0) }; if exit_code == 0 { Some(sa) } else { None } } #[cfg(test)] mod tests { - use crate::sais64; + use crate::sais64_long; #[test] fn check_build_sa_with_libsais64() { diff --git a/sa-builder/src/bitpacking.rs b/sa-builder/src/bitpacking.rs new file mode 100644 index 0000000..f53f0fe --- /dev/null +++ b/sa-builder/src/bitpacking.rs @@ -0,0 +1,39 @@ + + +// Function to get the rank of a character +fn get_rank(c: u8) -> u8 { + match c { + b'$' => 0, + b'-' => 1, + _ => 2 + (c - b'A'), + } +} + +const BITS_PER_CHAR: usize = 5; +pub fn bitpack_text(text: &Vec, sparseness_factor: u8) -> Vec { + let sparseness_factor = sparseness_factor as usize; + let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; + let mut text_packed = vec![0; num_ints]; + + for i in 0..(num_ints-1) { + let ti = i * sparseness_factor; + let mut element = 0i64; + for j in 0..sparseness_factor { + let rank_c = get_rank(text[ti + j]) as i64; + element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)); + } + text_packed[i] = element; + } + + // Handle the last element + let mut last_element = 0i64; + let last_el_start = sparseness_factor * (num_ints - 1); + for i in 0..((text.len() - 1) % sparseness_factor + 1) { + let rank_c = get_rank(text[last_el_start + i]) as i64; + last_element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - i)); + } + text_packed[num_ints - 1] = last_element; + + text_packed + +} \ No newline at end of file diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index c0e13cd..2c4871b 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -1,7 +1,9 @@ use std::error::Error; - +use crate::bitpacking::bitpack_text; use clap::{Parser, ValueEnum}; +pub mod bitpacking; + /// Build a (sparse, compressed) suffix array from the given text #[derive(Parser, Debug)] pub struct Arguments { @@ -55,13 +57,20 @@ pub fn build_ssa( // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { - SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(text), + SAConstructionAlgorithm::LibSais => { + let bits_per_char = 5; + let sparseness_factor = 4; + let mut packed_text = bitpack_text(text, sparseness_factor); + libsais64_rs::sais64_long(&mut packed_text, 1 << (bits_per_char * sparseness_factor)) + }, SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text) } .ok_or("Building suffix array failed")?; // make the SA sparse and decrease the vector size if we have sampling (sampling_rate > 1) - sample_sa(&mut sa, sparseness_factor); + if *construction_algorithm == SAConstructionAlgorithm::LibDivSufSort { + sample_sa(&mut sa, sparseness_factor); + } Ok(sa) } diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index 01cc3c4..35a7e34 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -1,5 +1,5 @@ use std::{ - fs::{File, OpenOptions}, + fs::{self, File, OpenOptions}, io::BufWriter, time::{SystemTime, SystemTimeError, UNIX_EPOCH} }; @@ -21,8 +21,9 @@ fn main() { eprintln!(); eprintln!("📋 Started loading the proteins..."); let start_proteins_time = get_time_ms().unwrap(); - let mut data = Proteins::try_from_database_file_uncompressed(&database_file) - .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); + /*let mut data = Proteins::try_from_database_file_uncompressed(&database_file) + .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));*/ + let mut data = fs::read_to_string(database_file).expect("Data file does not exist").into_bytes(); eprintln!( "✅ Successfully loaded the proteins in {} seconds!", (get_time_ms().unwrap() - start_proteins_time) / 1000.0 From 850b2ee0703a48dddd9822d73ca03b0321d3d5d7 Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Mon, 14 Oct 2024 10:34:58 +0200 Subject: [PATCH 02/26] use global variable for bits_per_char --- sa-builder/src/bitpacking.rs | 2 +- sa-builder/src/lib.rs | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sa-builder/src/bitpacking.rs b/sa-builder/src/bitpacking.rs index f53f0fe..192d918 100644 --- a/sa-builder/src/bitpacking.rs +++ b/sa-builder/src/bitpacking.rs @@ -9,7 +9,7 @@ fn get_rank(c: u8) -> u8 { } } -const BITS_PER_CHAR: usize = 5; +pub const BITS_PER_CHAR: usize = 5; pub fn bitpack_text(text: &Vec, sparseness_factor: u8) -> Vec { let sparseness_factor = sparseness_factor as usize; let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 2c4871b..97dcb93 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -1,5 +1,6 @@ use std::error::Error; use crate::bitpacking::bitpack_text; +use crate::bitpacking::BITS_PER_CHAR; use clap::{Parser, ValueEnum}; pub mod bitpacking; @@ -58,10 +59,8 @@ pub fn build_ssa( // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { SAConstructionAlgorithm::LibSais => { - let bits_per_char = 5; - let sparseness_factor = 4; let mut packed_text = bitpack_text(text, sparseness_factor); - libsais64_rs::sais64_long(&mut packed_text, 1 << (bits_per_char * sparseness_factor)) + libsais64_rs::sais64_long(&mut packed_text, 1 << (BITS_PER_CHAR * sparseness_factor as usize)) }, SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text) } From c09b61aedc217062921cefac123a8936aa213770 Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Mon, 14 Oct 2024 11:47:03 +0200 Subject: [PATCH 03/26] fixed tests for libsais with bitpacking --- libsais64-rs/src/lib.rs | 22 +++++++++++++++++----- sa-builder/src/bitpacking.rs | 4 ++++ sa-builder/src/lib.rs | 3 ++- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index c2c9369..93d7158 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -13,10 +13,16 @@ include!(concat!(env!("OUT_DIR"), "/bindings.rs")); /// /// Returns Some with the suffix array build over the text if construction succeeds /// Returns None if construction of the suffix array failed -pub fn sais64_long(text: &mut Vec, alphabet_size: i64) -> Option> { +pub fn sais64_long(text: &mut Vec, alphabet_size: i64, sparseness_factor: u8) -> Option> { let mut sa = vec![0; text.len()]; let exit_code = unsafe { libsais64_long(text.as_mut_ptr(), sa.as_mut_ptr(), text.len() as i64, alphabet_size, 0) }; - if exit_code == 0 { Some(sa) } else { None } + if exit_code == 0 { + let sparseness_factor = sparseness_factor as i64; + for elem in sa.iter_mut() { + *elem *= sparseness_factor; + } + Some(sa) + } else { None } } #[cfg(test)] @@ -25,8 +31,14 @@ mod tests { #[test] fn check_build_sa_with_libsais64() { - let text = "banana$"; - let sa = sais64(text.as_bytes()); - assert_eq!(sa, Some(vec![6, 5, 3, 1, 0, 4, 2])); + let bits_per_char = 5; + let sparseness_factor = 4; + let mut text = [100834, // BANA + 493603, // NA-B + 80975, // ANAN + 65536 // A$ + ].to_vec(); + let sa = sais64_long(&mut text, 1 << (bits_per_char * sparseness_factor), sparseness_factor); + assert_eq!(sa, Some(vec![12, 8, 0, 4])); } } diff --git a/sa-builder/src/bitpacking.rs b/sa-builder/src/bitpacking.rs index 192d918..2c1404b 100644 --- a/sa-builder/src/bitpacking.rs +++ b/sa-builder/src/bitpacking.rs @@ -15,6 +15,10 @@ pub fn bitpack_text(text: &Vec, sparseness_factor: u8) -> Vec { let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; let mut text_packed = vec![0; num_ints]; + if text.len() == 0 { + return text_packed; + } + for i in 0..(num_ints-1) { let ti = i * sparseness_factor; let mut element = 0i64; diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 97dcb93..9601f86 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -60,7 +60,8 @@ pub fn build_ssa( let mut sa = match construction_algorithm { SAConstructionAlgorithm::LibSais => { let mut packed_text = bitpack_text(text, sparseness_factor); - libsais64_rs::sais64_long(&mut packed_text, 1 << (BITS_PER_CHAR * sparseness_factor as usize)) + + libsais64_rs::sais64_long(&mut packed_text, 1 << (BITS_PER_CHAR * sparseness_factor as usize), sparseness_factor) }, SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text) } From 8d72316cca069e03d7df0df5a937686f47331c02 Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Mon, 14 Oct 2024 13:46:56 +0200 Subject: [PATCH 04/26] remove debug code --- sa-builder/src/main.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index 35a7e34..8b9d221 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -21,9 +21,8 @@ fn main() { eprintln!(); eprintln!("📋 Started loading the proteins..."); let start_proteins_time = get_time_ms().unwrap(); - /*let mut data = Proteins::try_from_database_file_uncompressed(&database_file) - .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));*/ - let mut data = fs::read_to_string(database_file).expect("Data file does not exist").into_bytes(); + let mut data = Proteins::try_from_database_file_uncompressed(&database_file) + .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); eprintln!( "✅ Successfully loaded the proteins in {} seconds!", (get_time_ms().unwrap() - start_proteins_time) / 1000.0 From 1b15fad14478fe34a3f37167323739cc386eb4f0 Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Mon, 14 Oct 2024 13:47:54 +0200 Subject: [PATCH 05/26] remove unused import --- sa-builder/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index 8b9d221..01cc3c4 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -1,5 +1,5 @@ use std::{ - fs::{self, File, OpenOptions}, + fs::{File, OpenOptions}, io::BufWriter, time::{SystemTime, SystemTimeError, UNIX_EPOCH} }; From 9053d254ba81f7da1148a9d6e4ac0f9c338159a5 Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Mon, 14 Oct 2024 16:09:25 +0200 Subject: [PATCH 06/26] bugfix shift overflow --- sa-builder/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 9601f86..f322486 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -61,7 +61,7 @@ pub fn build_ssa( SAConstructionAlgorithm::LibSais => { let mut packed_text = bitpack_text(text, sparseness_factor); - libsais64_rs::sais64_long(&mut packed_text, 1 << (BITS_PER_CHAR * sparseness_factor as usize), sparseness_factor) + libsais64_rs::sais64_long(&mut packed_text, (1 as i64) << (BITS_PER_CHAR * sparseness_factor as usize), sparseness_factor) }, SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text) } From 0424cd3d38637719df786ece86f20b78e445d0a5 Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Fri, 18 Oct 2024 10:39:41 +0200 Subject: [PATCH 07/26] use adapted libsais library --- libsais64-rs/builder.rs | 2 +- libsais64-rs/libsais-wrapper.h | 4 ++-- libsais64-rs/src/lib.rs | 5 +++-- sa-builder/src/bitpacking.rs | 10 +++++----- sa-builder/src/lib.rs | 5 ++--- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/libsais64-rs/builder.rs b/libsais64-rs/builder.rs index 5b3feb2..fe8430e 100644 --- a/libsais64-rs/builder.rs +++ b/libsais64-rs/builder.rs @@ -53,7 +53,7 @@ fn main() -> Result<(), Box> { // clone the c library Command::new("git") - .args(["clone", "https://github.com/IlyaGrebnov/libsais.git", "--depth=1"]) + .args(["clone", "git@github.com:unipept/unipept-libsais.git", "libsais", "--depth=1"]) .status() .expect("Failed to clone the libsais repository"); diff --git a/libsais64-rs/libsais-wrapper.h b/libsais64-rs/libsais-wrapper.h index fbfe0b9..6b4532b 100644 --- a/libsais64-rs/libsais-wrapper.h +++ b/libsais64-rs/libsais-wrapper.h @@ -1,4 +1,4 @@ -#include "libsais/include/libsais64.h" +#include "libsais/include/libsais16x64.h" -int64_t libsais64(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); \ No newline at end of file +int64_t libsais16x64(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); \ No newline at end of file diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index 93d7158..abb5e15 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -2,6 +2,7 @@ #![allow(non_upper_case_globals)] #![allow(non_camel_case_types)] #![allow(non_snake_case)] +use std::ptr::null_mut; include!(concat!(env!("OUT_DIR"), "/bindings.rs")); /// Builds the suffix array over the `text` using the libsais64 algorithm @@ -13,9 +14,9 @@ include!(concat!(env!("OUT_DIR"), "/bindings.rs")); /// /// Returns Some with the suffix array build over the text if construction succeeds /// Returns None if construction of the suffix array failed -pub fn sais64_long(text: &mut Vec, alphabet_size: i64, sparseness_factor: u8) -> Option> { +pub fn sais64(text: &Vec, sparseness_factor: u8) -> Option> { let mut sa = vec![0; text.len()]; - let exit_code = unsafe { libsais64_long(text.as_mut_ptr(), sa.as_mut_ptr(), text.len() as i64, alphabet_size, 0) }; + let exit_code = unsafe { libsais16x64(text.as_ptr(), sa.as_mut_ptr(), text.len() as i64, 0, null_mut()) }; if exit_code == 0 { let sparseness_factor = sparseness_factor as i64; for elem in sa.iter_mut() { diff --git a/sa-builder/src/bitpacking.rs b/sa-builder/src/bitpacking.rs index 2c1404b..486ffd4 100644 --- a/sa-builder/src/bitpacking.rs +++ b/sa-builder/src/bitpacking.rs @@ -10,7 +10,7 @@ fn get_rank(c: u8) -> u8 { } pub const BITS_PER_CHAR: usize = 5; -pub fn bitpack_text(text: &Vec, sparseness_factor: u8) -> Vec { +pub fn bitpack_text(text: &Vec, sparseness_factor: u8) -> Vec { let sparseness_factor = sparseness_factor as usize; let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; let mut text_packed = vec![0; num_ints]; @@ -21,19 +21,19 @@ pub fn bitpack_text(text: &Vec, sparseness_factor: u8) -> Vec { for i in 0..(num_ints-1) { let ti = i * sparseness_factor; - let mut element = 0i64; + let mut element = 0u16; for j in 0..sparseness_factor { - let rank_c = get_rank(text[ti + j]) as i64; + let rank_c = get_rank(text[ti + j]) as u16; element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)); } text_packed[i] = element; } // Handle the last element - let mut last_element = 0i64; + let mut last_element = 0u16; let last_el_start = sparseness_factor * (num_ints - 1); for i in 0..((text.len() - 1) % sparseness_factor + 1) { - let rank_c = get_rank(text[last_el_start + i]) as i64; + let rank_c = get_rank(text[last_el_start + i]) as u16; last_element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - i)); } text_packed[num_ints - 1] = last_element; diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index f322486..14b6851 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -1,6 +1,5 @@ use std::error::Error; use crate::bitpacking::bitpack_text; -use crate::bitpacking::BITS_PER_CHAR; use clap::{Parser, ValueEnum}; pub mod bitpacking; @@ -59,9 +58,9 @@ pub fn build_ssa( // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { SAConstructionAlgorithm::LibSais => { - let mut packed_text = bitpack_text(text, sparseness_factor); + let packed_text = bitpack_text(text, sparseness_factor); - libsais64_rs::sais64_long(&mut packed_text, (1 as i64) << (BITS_PER_CHAR * sparseness_factor as usize), sparseness_factor) + libsais64_rs::sais64(&packed_text, sparseness_factor) }, SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text) } From 0ecb79eb825b45d95deaf9711ba8fed155df7cf4 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Tue, 22 Oct 2024 13:36:01 +0200 Subject: [PATCH 08/26] support for sparseness factor up to 6 --- libsais64-rs/libsais-wrapper.h | 4 +- .../src/bitpacking.rs | 36 +++++++++++++++- libsais64-rs/src/lib.rs | 43 ++++++++++++++++--- sa-builder/src/lib.rs | 15 ++----- 4 files changed, 77 insertions(+), 21 deletions(-) rename {sa-builder => libsais64-rs}/src/bitpacking.rs (50%) diff --git a/libsais64-rs/libsais-wrapper.h b/libsais64-rs/libsais-wrapper.h index 6b4532b..eb2cd8d 100644 --- a/libsais64-rs/libsais-wrapper.h +++ b/libsais64-rs/libsais-wrapper.h @@ -1,4 +1,6 @@ #include "libsais/include/libsais16x64.h" -int64_t libsais16x64(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); \ No newline at end of file +int64_t libsais16x64(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); + +int64_t libsais32x64(const uint32_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs, int64_t * freq); \ No newline at end of file diff --git a/sa-builder/src/bitpacking.rs b/libsais64-rs/src/bitpacking.rs similarity index 50% rename from sa-builder/src/bitpacking.rs rename to libsais64-rs/src/bitpacking.rs index 486ffd4..df1aeed 100644 --- a/sa-builder/src/bitpacking.rs +++ b/libsais64-rs/src/bitpacking.rs @@ -10,8 +10,8 @@ fn get_rank(c: u8) -> u8 { } pub const BITS_PER_CHAR: usize = 5; -pub fn bitpack_text(text: &Vec, sparseness_factor: u8) -> Vec { - let sparseness_factor = sparseness_factor as usize; +pub fn bitpack_text_16(text: &Vec, sparseness_factor: usize) -> Vec { + let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; let mut text_packed = vec![0; num_ints]; @@ -40,4 +40,36 @@ pub fn bitpack_text(text: &Vec, sparseness_factor: u8) -> Vec { text_packed +} + +pub fn bitpack_text_32(text: &Vec, sparseness_factor: usize) -> Vec { + + let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; + let mut text_packed = vec![0; num_ints]; + + if text.len() == 0 { + return text_packed; + } + + for i in 0..(num_ints-1) { + let ti = i * sparseness_factor; + let mut element = 0u32; + for j in 0..sparseness_factor { + let rank_c = get_rank(text[ti + j]) as u32; + element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)); + } + text_packed[i] = element; + } + + // Handle the last element + let mut last_element = 0u32; + let last_el_start = sparseness_factor * (num_ints - 1); + for i in 0..((text.len() - 1) % sparseness_factor + 1) { + let rank_c = get_rank(text[last_el_start + i]) as u32; + last_element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - i)); + } + text_packed[num_ints - 1] = last_element; + + text_packed + } \ No newline at end of file diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index abb5e15..470b56d 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -3,8 +3,11 @@ #![allow(non_camel_case_types)] #![allow(non_snake_case)] use std::ptr::null_mut; +use crate::bitpacking::{bitpack_text_16, bitpack_text_32, BITS_PER_CHAR}; include!(concat!(env!("OUT_DIR"), "/bindings.rs")); +pub mod bitpacking; + /// Builds the suffix array over the `text` using the libsais64 algorithm /// /// # Arguments @@ -14,16 +17,42 @@ include!(concat!(env!("OUT_DIR"), "/bindings.rs")); /// /// Returns Some with the suffix array build over the text if construction succeeds /// Returns None if construction of the suffix array failed -pub fn sais64(text: &Vec, sparseness_factor: u8) -> Option> { - let mut sa = vec![0; text.len()]; - let exit_code = unsafe { libsais16x64(text.as_ptr(), sa.as_mut_ptr(), text.len() as i64, 0, null_mut()) }; +pub fn sais64(text: &Vec, sparseness_factor: u8) -> Result, &str> { + let sparseness_factor = sparseness_factor as usize; + let mut libsais_sparseness = sparseness_factor; + let mut sa; + let exit_code; + + if sparseness_factor * BITS_PER_CHAR <= 16 { + // bitpacked values fit in uint16_t + let packed_text = bitpack_text_16(text, libsais_sparseness); + sa = vec![0; packed_text.len()]; + exit_code = unsafe { libsais16x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) }; + } else { + // bitpacked values do not fit in uint16_t, use 32-bit text + // set libsais_sparseness to highest sparseness factor fitting in 32-bit value and sparseness factor divisible by libsais sparseness + libsais_sparseness = 32 / BITS_PER_CHAR; + while sparseness_factor % libsais_sparseness != 0 && libsais_sparseness * BITS_PER_CHAR > 16 { + libsais_sparseness -= 1; + } + + if sparseness_factor % libsais_sparseness != 0 { + return Err("invalid sparseness factor"); + } + + let packed_text = bitpack_text_32(text, libsais_sparseness); + sa = vec![0; packed_text.len()]; + let k = 1 << (libsais_sparseness * BITS_PER_CHAR); + exit_code = unsafe { libsais32x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, k, 0, null_mut()) }; + } + if exit_code == 0 { - let sparseness_factor = sparseness_factor as i64; for elem in sa.iter_mut() { - *elem *= sparseness_factor; + let libsais_sparseness = libsais_sparseness as i64; + *elem *= libsais_sparseness; } - Some(sa) - } else { None } + Ok(sa) + } else { Err("Failed building suffix array") } } #[cfg(test)] diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 14b6851..529e1be 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -1,9 +1,6 @@ use std::error::Error; -use crate::bitpacking::bitpack_text; use clap::{Parser, ValueEnum}; -pub mod bitpacking; - /// Build a (sparse, compressed) suffix array from the given text #[derive(Parser, Debug)] pub struct Arguments { @@ -57,14 +54,10 @@ pub fn build_ssa( // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { - SAConstructionAlgorithm::LibSais => { - let packed_text = bitpack_text(text, sparseness_factor); - - libsais64_rs::sais64(&packed_text, sparseness_factor) - }, - SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text) - } - .ok_or("Building suffix array failed")?; + SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(&text, sparseness_factor)?, + SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text).ok_or("Building suffix array failed")? + } + ; // make the SA sparse and decrease the vector size if we have sampling (sampling_rate > 1) if *construction_algorithm == SAConstructionAlgorithm::LibDivSufSort { From bba56bb24f194e5139dafc2296f968dd609d17f3 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Tue, 22 Oct 2024 14:18:29 +0200 Subject: [PATCH 09/26] set max alfabet size to 2^28 --- libsais64-rs/src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index 470b56d..8b075bd 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -31,7 +31,8 @@ pub fn sais64(text: &Vec, sparseness_factor: u8) -> Result, &str> { } else { // bitpacked values do not fit in uint16_t, use 32-bit text // set libsais_sparseness to highest sparseness factor fitting in 32-bit value and sparseness factor divisible by libsais sparseness - libsais_sparseness = 32 / BITS_PER_CHAR; + // max 28 out of 32 bits used, because a bucket is created for every element of the alfabet 8 * 2^28). + libsais_sparseness = 28 / BITS_PER_CHAR; while sparseness_factor % libsais_sparseness != 0 && libsais_sparseness * BITS_PER_CHAR > 16 { libsais_sparseness -= 1; } From ae2e5fa456b2218535bedf9ea1741138b00c26a1 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Tue, 22 Oct 2024 14:39:14 +0200 Subject: [PATCH 10/26] bugfix: check if libsais_sparseness big enough --- libsais64-rs/src/lib.rs | 2 +- sa-builder/src/lib.rs | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index 8b075bd..e0ff871 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -37,7 +37,7 @@ pub fn sais64(text: &Vec, sparseness_factor: u8) -> Result, &str> { libsais_sparseness -= 1; } - if sparseness_factor % libsais_sparseness != 0 { + if libsais_sparseness * BITS_PER_CHAR <= 16 { return Err("invalid sparseness factor"); } diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 529e1be..2cd4afb 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -56,8 +56,7 @@ pub fn build_ssa( let mut sa = match construction_algorithm { SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(&text, sparseness_factor)?, SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text).ok_or("Building suffix array failed")? - } - ; + }; // make the SA sparse and decrease the vector size if we have sampling (sampling_rate > 1) if *construction_algorithm == SAConstructionAlgorithm::LibDivSufSort { From f1dc185bcff67648273874030ebaa9d63ca08a2e Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Tue, 22 Oct 2024 15:07:00 +0200 Subject: [PATCH 11/26] allow all sparseness factors --- libsais64-rs/src/lib.rs | 20 +++----------------- sa-builder/src/lib.rs | 26 +++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index e0ff871..2f95c93 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -17,30 +17,16 @@ pub mod bitpacking; /// /// Returns Some with the suffix array build over the text if construction succeeds /// Returns None if construction of the suffix array failed -pub fn sais64(text: &Vec, sparseness_factor: u8) -> Result, &str> { - let sparseness_factor = sparseness_factor as usize; - let mut libsais_sparseness = sparseness_factor; - let mut sa; +pub fn sais64(text: &Vec, libsais_sparseness: usize) -> Result, &str> { let exit_code; + let mut sa; - if sparseness_factor * BITS_PER_CHAR <= 16 { + if libsais_sparseness * BITS_PER_CHAR <= 16 { // bitpacked values fit in uint16_t let packed_text = bitpack_text_16(text, libsais_sparseness); sa = vec![0; packed_text.len()]; exit_code = unsafe { libsais16x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) }; } else { - // bitpacked values do not fit in uint16_t, use 32-bit text - // set libsais_sparseness to highest sparseness factor fitting in 32-bit value and sparseness factor divisible by libsais sparseness - // max 28 out of 32 bits used, because a bucket is created for every element of the alfabet 8 * 2^28). - libsais_sparseness = 28 / BITS_PER_CHAR; - while sparseness_factor % libsais_sparseness != 0 && libsais_sparseness * BITS_PER_CHAR > 16 { - libsais_sparseness -= 1; - } - - if libsais_sparseness * BITS_PER_CHAR <= 16 { - return Err("invalid sparseness factor"); - } - let packed_text = bitpack_text_32(text, libsais_sparseness); sa = vec![0; packed_text.len()]; let k = 1 << (libsais_sparseness * BITS_PER_CHAR); diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 2cd4afb..f1fa28e 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -54,7 +54,7 @@ pub fn build_ssa( // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { - SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(&text, sparseness_factor)?, + SAConstructionAlgorithm::LibSais => libsais64(&text, sparseness_factor)?, SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text).ok_or("Building suffix array failed")? }; @@ -66,6 +66,30 @@ pub fn build_ssa( Ok(sa) } +const MAX_SPARSENESS: usize = 5; +fn libsais64(text: &Vec, sparseness_factor: u8) -> Result, &str> { + let sparseness_factor = sparseness_factor as usize; + + // set libsais_sparseness to highest sparseness factor fitting in 32-bit value and sparseness factor divisible by libsais sparseness + // max 28 out of 32 bits used, because a bucket is created for every element of the alfabet 8 * 2^28). + let mut libsais_sparseness = MAX_SPARSENESS; + while sparseness_factor % libsais_sparseness != 0 { + libsais_sparseness -= 1; + } + let sample_rate = sparseness_factor / libsais_sparseness; + println!(" Sparseness factor: {}", sparseness_factor); + println!(" Libsais sparseness factor: {}", libsais_sparseness); + println!(" Sample rate: {}", sample_rate); + + let mut sa = libsais64_rs::sais64(&text, libsais_sparseness)?; + + if sample_rate > 1 { + sample_sa(&mut sa, sample_rate as u8); + } + + Ok(sa) +} + /// Translate all L's to I's in the given text /// /// # Arguments From 13a6d697e7497c6c69c059fd1354c6f3595c0b01 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Wed, 23 Oct 2024 10:13:34 +0200 Subject: [PATCH 12/26] minor changes --- Cargo.toml | 3 ++- sa-builder/src/main.rs | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fec447c..c53d905 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,8 @@ resolver = "2" members = [ "bitarray", "fa-compression", "libsais64-rs", - "sa-builder", "sa-compression", + "sa-builder", + "sa-compression", "sa-index", "sa-mappings", "sa-server" diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index 01cc3c4..0c77c19 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -38,7 +38,6 @@ fn main() { (get_time_ms().unwrap() - start_ssa_time) / 1000.0 ); eprintln!("\tAmount of items: {}", sa.len()); - eprintln!("\tSample rate: {}", sparseness_factor); // open the output file let mut file = From 64e5589cffd22dd3f909378f945879cad7e73604 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Wed, 23 Oct 2024 12:15:48 +0200 Subject: [PATCH 13/26] add support for 8-bit libsais text --- libsais64-rs/libsais-wrapper.h | 4 +++- libsais64-rs/src/bitpacking.rs | 32 ++++++++++++++++++++++++++++++++ libsais64-rs/src/lib.rs | 14 ++++++++++---- sa-builder/src/lib.rs | 6 +++--- 4 files changed, 48 insertions(+), 8 deletions(-) diff --git a/libsais64-rs/libsais-wrapper.h b/libsais64-rs/libsais-wrapper.h index eb2cd8d..aa0a21d 100644 --- a/libsais64-rs/libsais-wrapper.h +++ b/libsais64-rs/libsais-wrapper.h @@ -3,4 +3,6 @@ int64_t libsais16x64(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); -int64_t libsais32x64(const uint32_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs, int64_t * freq); \ No newline at end of file +int64_t libsais32x64(const uint32_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs, int64_t * freq); + +int64_t libsais64(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); \ No newline at end of file diff --git a/libsais64-rs/src/bitpacking.rs b/libsais64-rs/src/bitpacking.rs index df1aeed..9ee002e 100644 --- a/libsais64-rs/src/bitpacking.rs +++ b/libsais64-rs/src/bitpacking.rs @@ -10,6 +10,38 @@ fn get_rank(c: u8) -> u8 { } pub const BITS_PER_CHAR: usize = 5; +pub fn bitpack_text_8(text: &Vec, sparseness_factor: usize) -> Vec { + + let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; + let mut text_packed = vec![0; num_ints]; + + if text.len() == 0 { + return text_packed; + } + + for i in 0..(num_ints-1) { + let ti = i * sparseness_factor; + let mut element = 0u8; + for j in 0..sparseness_factor { + let rank_c = get_rank(text[ti + j]); + element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)); + } + text_packed[i] = element; + } + + // Handle the last element + let mut last_element = 0u8; + let last_el_start = sparseness_factor * (num_ints - 1); + for i in 0..((text.len() - 1) % sparseness_factor + 1) { + let rank_c = get_rank(text[last_el_start + i]); + last_element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - i)); + } + text_packed[num_ints - 1] = last_element; + + text_packed + +} + pub fn bitpack_text_16(text: &Vec, sparseness_factor: usize) -> Vec { let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index 2f95c93..f6b6783 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -3,7 +3,7 @@ #![allow(non_camel_case_types)] #![allow(non_snake_case)] use std::ptr::null_mut; -use crate::bitpacking::{bitpack_text_16, bitpack_text_32, BITS_PER_CHAR}; +use crate::bitpacking::{bitpack_text_16, bitpack_text_32, bitpack_text_8, BITS_PER_CHAR}; include!(concat!(env!("OUT_DIR"), "/bindings.rs")); pub mod bitpacking; @@ -21,7 +21,13 @@ pub fn sais64(text: &Vec, libsais_sparseness: usize) -> Result, &st let exit_code; let mut sa; - if libsais_sparseness * BITS_PER_CHAR <= 16 { + let required_bits = libsais_sparseness * BITS_PER_CHAR; + if required_bits <= 8 { + // bitpacked values fit in uint8_t + let packed_text = bitpack_text_8(text, libsais_sparseness); + sa = vec![0; packed_text.len()]; + exit_code = unsafe { libsais64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) }; + } else if required_bits <= 16 { // bitpacked values fit in uint16_t let packed_text = bitpack_text_16(text, libsais_sparseness); sa = vec![0; packed_text.len()]; @@ -44,7 +50,7 @@ pub fn sais64(text: &Vec, libsais_sparseness: usize) -> Result, &st #[cfg(test)] mod tests { - use crate::sais64_long; + use crate::sais64; #[test] fn check_build_sa_with_libsais64() { @@ -55,7 +61,7 @@ mod tests { 80975, // ANAN 65536 // A$ ].to_vec(); - let sa = sais64_long(&mut text, 1 << (bits_per_char * sparseness_factor), sparseness_factor); + let sa = sais64(&mut text, sparseness_factor); assert_eq!(sa, Some(vec![12, 8, 0, 4])); } } diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index f1fa28e..815b8d6 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -77,9 +77,9 @@ fn libsais64(text: &Vec, sparseness_factor: u8) -> Result, &str> { libsais_sparseness -= 1; } let sample_rate = sparseness_factor / libsais_sparseness; - println!(" Sparseness factor: {}", sparseness_factor); - println!(" Libsais sparseness factor: {}", libsais_sparseness); - println!(" Sample rate: {}", sample_rate); + eprintln!("\tSparseness factor: {}", sparseness_factor); + eprintln!("\tLibsais sparseness factor: {}", libsais_sparseness); + eprintln!("\tSample rate: {}", sample_rate); let mut sa = libsais64_rs::sais64(&text, libsais_sparseness)?; From aa6ef9ffbc4682ebc5cd33d1beb641bc4d20ee7e Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Thu, 24 Oct 2024 11:17:08 +0200 Subject: [PATCH 14/26] add comments + do not bitpack text sparseness is 1 --- libsais64-rs/src/bitpacking.rs | 8 ++++++++ libsais64-rs/src/lib.rs | 14 +++++--------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/libsais64-rs/src/bitpacking.rs b/libsais64-rs/src/bitpacking.rs index 9ee002e..d6d4cc8 100644 --- a/libsais64-rs/src/bitpacking.rs +++ b/libsais64-rs/src/bitpacking.rs @@ -9,8 +9,12 @@ fn get_rank(c: u8) -> u8 { } } +// Amount of bits necessary to represent one character in the protein text. pub const BITS_PER_CHAR: usize = 5; + +// Bitpack text in a vector of u8 elements. BITS_PER_CHAR * sparseness_factor <= 8. pub fn bitpack_text_8(text: &Vec, sparseness_factor: usize) -> Vec { + assert!(BITS_PER_CHAR * sparseness_factor <= 8); let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; let mut text_packed = vec![0; num_ints]; @@ -42,7 +46,9 @@ pub fn bitpack_text_8(text: &Vec, sparseness_factor: usize) -> Vec { } +// Bitpack text in a vector of u16 elements. BITS_PER_CHAR * sparseness_factor <= 16. pub fn bitpack_text_16(text: &Vec, sparseness_factor: usize) -> Vec { + assert!(BITS_PER_CHAR * sparseness_factor <= 16); let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; let mut text_packed = vec![0; num_ints]; @@ -74,7 +80,9 @@ pub fn bitpack_text_16(text: &Vec, sparseness_factor: usize) -> Vec { } +// Bitpack text in a vector of u16 elements. BITS_PER_CHAR * sparseness_factor <= 32. pub fn bitpack_text_32(text: &Vec, sparseness_factor: usize) -> Vec { + assert!(BITS_PER_CHAR * sparseness_factor <= 32); let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; let mut text_packed = vec![0; num_ints]; diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index f6b6783..d9eca5a 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -8,7 +8,7 @@ include!(concat!(env!("OUT_DIR"), "/bindings.rs")); pub mod bitpacking; -/// Builds the suffix array over the `text` using the libsais64 algorithm +/// Builds the suffix array over the `text` using the libsais algorithm /// /// # Arguments /// * `text` - The text used for suffix array construction @@ -24,7 +24,7 @@ pub fn sais64(text: &Vec, libsais_sparseness: usize) -> Result, &st let required_bits = libsais_sparseness * BITS_PER_CHAR; if required_bits <= 8 { // bitpacked values fit in uint8_t - let packed_text = bitpack_text_8(text, libsais_sparseness); + let packed_text = if libsais_sparseness == 1 { text } else { &bitpack_text_8(text, libsais_sparseness) }; sa = vec![0; packed_text.len()]; exit_code = unsafe { libsais64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) }; } else if required_bits <= 16 { @@ -54,14 +54,10 @@ mod tests { #[test] fn check_build_sa_with_libsais64() { - let bits_per_char = 5; let sparseness_factor = 4; - let mut text = [100834, // BANA - 493603, // NA-B - 80975, // ANAN - 65536 // A$ - ].to_vec(); + let mut text = "BANANA-BANANA$".as_bytes().to_vec(); let sa = sais64(&mut text, sparseness_factor); - assert_eq!(sa, Some(vec![12, 8, 0, 4])); + let correct_sa: Vec = vec![12, 8, 0, 4]; + assert_eq!(sa, Ok(correct_sa)); } } From b9064e61289e5e882b8b7a9ab94c850854590559 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Thu, 24 Oct 2024 11:27:50 +0200 Subject: [PATCH 15/26] add comments --- sa-builder/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 815b8d6..424b306 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -66,6 +66,7 @@ pub fn build_ssa( Ok(sa) } +// Max sparseness for libsais because it creates a bucket for each element of the alphabet (2 ^ (sparseness * bits_per_char) buckets). const MAX_SPARSENESS: usize = 5; fn libsais64(text: &Vec, sparseness_factor: u8) -> Result, &str> { let sparseness_factor = sparseness_factor as usize; From 944adc297833079abaf30caa473bb7b04be9cdb8 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Thu, 24 Oct 2024 11:36:24 +0200 Subject: [PATCH 16/26] add comment --- libsais64-rs/src/bitpacking.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libsais64-rs/src/bitpacking.rs b/libsais64-rs/src/bitpacking.rs index d6d4cc8..ea6578e 100644 --- a/libsais64-rs/src/bitpacking.rs +++ b/libsais64-rs/src/bitpacking.rs @@ -80,7 +80,7 @@ pub fn bitpack_text_16(text: &Vec, sparseness_factor: usize) -> Vec { } -// Bitpack text in a vector of u16 elements. BITS_PER_CHAR * sparseness_factor <= 32. +// Bitpack text in a vector of u32 elements. BITS_PER_CHAR * sparseness_factor <= 32. pub fn bitpack_text_32(text: &Vec, sparseness_factor: usize) -> Vec { assert!(BITS_PER_CHAR * sparseness_factor <= 32); From 6dccb19275ed0a99e164ee467baafbfbb611d2c8 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Thu, 24 Oct 2024 11:48:00 +0200 Subject: [PATCH 17/26] run clippy --- libsais64-rs/src/bitpacking.rs | 33 +++++++++++++++------------------ sa-builder/src/lib.rs | 4 ++-- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/libsais64-rs/src/bitpacking.rs b/libsais64-rs/src/bitpacking.rs index ea6578e..d3fb910 100644 --- a/libsais64-rs/src/bitpacking.rs +++ b/libsais64-rs/src/bitpacking.rs @@ -13,24 +13,23 @@ fn get_rank(c: u8) -> u8 { pub const BITS_PER_CHAR: usize = 5; // Bitpack text in a vector of u8 elements. BITS_PER_CHAR * sparseness_factor <= 8. -pub fn bitpack_text_8(text: &Vec, sparseness_factor: usize) -> Vec { +pub fn bitpack_text_8(text: &[u8], sparseness_factor: usize) -> Vec { assert!(BITS_PER_CHAR * sparseness_factor <= 8); let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; let mut text_packed = vec![0; num_ints]; - if text.len() == 0 { + if text.is_empty() { return text_packed; } - for i in 0..(num_ints-1) { + for (i, element) in text_packed.iter_mut().enumerate().take(num_ints-1) { let ti = i * sparseness_factor; - let mut element = 0u8; + *element = 0u8; for j in 0..sparseness_factor { let rank_c = get_rank(text[ti + j]); - element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)); + *element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)); } - text_packed[i] = element; } // Handle the last element @@ -47,24 +46,23 @@ pub fn bitpack_text_8(text: &Vec, sparseness_factor: usize) -> Vec { } // Bitpack text in a vector of u16 elements. BITS_PER_CHAR * sparseness_factor <= 16. -pub fn bitpack_text_16(text: &Vec, sparseness_factor: usize) -> Vec { +pub fn bitpack_text_16(text: &[u8], sparseness_factor: usize) -> Vec { assert!(BITS_PER_CHAR * sparseness_factor <= 16); let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; let mut text_packed = vec![0; num_ints]; - if text.len() == 0 { + if text.is_empty() { return text_packed; } - for i in 0..(num_ints-1) { + for (i, element) in text_packed.iter_mut().enumerate().take(num_ints-1) { let ti = i * sparseness_factor; - let mut element = 0u16; + *element = 0u16; for j in 0..sparseness_factor { let rank_c = get_rank(text[ti + j]) as u16; - element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)); + *element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)); } - text_packed[i] = element; } // Handle the last element @@ -81,24 +79,23 @@ pub fn bitpack_text_16(text: &Vec, sparseness_factor: usize) -> Vec { } // Bitpack text in a vector of u32 elements. BITS_PER_CHAR * sparseness_factor <= 32. -pub fn bitpack_text_32(text: &Vec, sparseness_factor: usize) -> Vec { +pub fn bitpack_text_32(text: &[u8], sparseness_factor: usize) -> Vec { assert!(BITS_PER_CHAR * sparseness_factor <= 32); let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; let mut text_packed = vec![0; num_ints]; - if text.len() == 0 { + if text.is_empty() { return text_packed; } - for i in 0..(num_ints-1) { + for (i, element) in text_packed.iter_mut().enumerate().take(num_ints-1) { let ti = i * sparseness_factor; - let mut element = 0u32; + *element = 0u32; for j in 0..sparseness_factor { let rank_c = get_rank(text[ti + j]) as u32; - element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)); + *element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)); } - text_packed[i] = element; } // Handle the last element diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 424b306..1ee7199 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -54,7 +54,7 @@ pub fn build_ssa( // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { - SAConstructionAlgorithm::LibSais => libsais64(&text, sparseness_factor)?, + SAConstructionAlgorithm::LibSais => libsais64(text, sparseness_factor)?, SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text).ok_or("Building suffix array failed")? }; @@ -82,7 +82,7 @@ fn libsais64(text: &Vec, sparseness_factor: u8) -> Result, &str> { eprintln!("\tLibsais sparseness factor: {}", libsais_sparseness); eprintln!("\tSample rate: {}", sample_rate); - let mut sa = libsais64_rs::sais64(&text, libsais_sparseness)?; + let mut sa = libsais64_rs::sais64(text, libsais_sparseness)?; if sample_rate > 1 { sample_sa(&mut sa, sample_rate as u8); From d2feb52bf0d80fe303155bc5b95d4cfa1c769238 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Thu, 24 Oct 2024 12:15:22 +0200 Subject: [PATCH 18/26] cargo fmt --- fa-compression/benches/algorithm2/decode.rs | 2 +- fa-compression/benches/algorithm2/encode.rs | 2 +- fa-compression/benches/util.rs | 2 +- libsais64-rs/src/bitpacking.rs | 21 ++++++++------------- libsais64-rs/src/lib.rs | 20 +++++++++++++------- sa-builder/src/lib.rs | 5 ++++- sa-builder/src/main.rs | 2 +- sa-compression/src/lib.rs | 2 +- sa-index/src/sa_searcher.rs | 8 ++++---- sa-index/src/suffix_to_protein_index.rs | 4 ++-- sa-server/src/main.rs | 10 +++++----- text-compression/src/lib.rs | 2 +- 12 files changed, 42 insertions(+), 38 deletions(-) diff --git a/fa-compression/benches/algorithm2/decode.rs b/fa-compression/benches/algorithm2/decode.rs index 4d562fc..bc14527 100644 --- a/fa-compression/benches/algorithm2/decode.rs +++ b/fa-compression/benches/algorithm2/decode.rs @@ -1,5 +1,5 @@ use criterion::black_box; -use fa_compression::algorithm2::{decode, encode, CompressionTable}; +use fa_compression::algorithm2::{CompressionTable, decode, encode}; use super::util::generate_annotation; diff --git a/fa-compression/benches/algorithm2/encode.rs b/fa-compression/benches/algorithm2/encode.rs index 827dd50..406487f 100644 --- a/fa-compression/benches/algorithm2/encode.rs +++ b/fa-compression/benches/algorithm2/encode.rs @@ -1,5 +1,5 @@ use criterion::black_box; -use fa_compression::algorithm2::{encode, CompressionTable}; +use fa_compression::algorithm2::{CompressionTable, encode}; use super::util::generate_annotation; diff --git a/fa-compression/benches/util.rs b/fa-compression/benches/util.rs index b6ddd9a..fee2359 100644 --- a/fa-compression/benches/util.rs +++ b/fa-compression/benches/util.rs @@ -1,4 +1,4 @@ -use rand::{rngs::ThreadRng, Rng}; +use rand::{Rng, rngs::ThreadRng}; /// Generate a random InterPro annotation. pub fn generate_ipr(random: &mut ThreadRng) -> String { diff --git a/libsais64-rs/src/bitpacking.rs b/libsais64-rs/src/bitpacking.rs index d3fb910..661589c 100644 --- a/libsais64-rs/src/bitpacking.rs +++ b/libsais64-rs/src/bitpacking.rs @@ -1,11 +1,9 @@ - - // Function to get the rank of a character fn get_rank(c: u8) -> u8 { match c { b'$' => 0, b'-' => 1, - _ => 2 + (c - b'A'), + _ => 2 + (c - b'A') } } @@ -16,14 +14,14 @@ pub const BITS_PER_CHAR: usize = 5; pub fn bitpack_text_8(text: &[u8], sparseness_factor: usize) -> Vec { assert!(BITS_PER_CHAR * sparseness_factor <= 8); - let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; + let num_ints = (text.len() + (sparseness_factor - 1)) / sparseness_factor; let mut text_packed = vec![0; num_ints]; if text.is_empty() { return text_packed; } - for (i, element) in text_packed.iter_mut().enumerate().take(num_ints-1) { + for (i, element) in text_packed.iter_mut().enumerate().take(num_ints - 1) { let ti = i * sparseness_factor; *element = 0u8; for j in 0..sparseness_factor { @@ -42,21 +40,20 @@ pub fn bitpack_text_8(text: &[u8], sparseness_factor: usize) -> Vec { text_packed[num_ints - 1] = last_element; text_packed - } // Bitpack text in a vector of u16 elements. BITS_PER_CHAR * sparseness_factor <= 16. pub fn bitpack_text_16(text: &[u8], sparseness_factor: usize) -> Vec { assert!(BITS_PER_CHAR * sparseness_factor <= 16); - let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; + let num_ints = (text.len() + (sparseness_factor - 1)) / sparseness_factor; let mut text_packed = vec![0; num_ints]; if text.is_empty() { return text_packed; } - for (i, element) in text_packed.iter_mut().enumerate().take(num_ints-1) { + for (i, element) in text_packed.iter_mut().enumerate().take(num_ints - 1) { let ti = i * sparseness_factor; *element = 0u16; for j in 0..sparseness_factor { @@ -75,21 +72,20 @@ pub fn bitpack_text_16(text: &[u8], sparseness_factor: usize) -> Vec { text_packed[num_ints - 1] = last_element; text_packed - } // Bitpack text in a vector of u32 elements. BITS_PER_CHAR * sparseness_factor <= 32. pub fn bitpack_text_32(text: &[u8], sparseness_factor: usize) -> Vec { assert!(BITS_PER_CHAR * sparseness_factor <= 32); - let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; + let num_ints = (text.len() + (sparseness_factor - 1)) / sparseness_factor; let mut text_packed = vec![0; num_ints]; if text.is_empty() { return text_packed; } - for (i, element) in text_packed.iter_mut().enumerate().take(num_ints-1) { + for (i, element) in text_packed.iter_mut().enumerate().take(num_ints - 1) { let ti = i * sparseness_factor; *element = 0u32; for j in 0..sparseness_factor { @@ -108,5 +104,4 @@ pub fn bitpack_text_32(text: &[u8], sparseness_factor: usize) -> Vec { text_packed[num_ints - 1] = last_element; text_packed - -} \ No newline at end of file +} diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index d9eca5a..13b4c55 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -3,7 +3,8 @@ #![allow(non_camel_case_types)] #![allow(non_snake_case)] use std::ptr::null_mut; -use crate::bitpacking::{bitpack_text_16, bitpack_text_32, bitpack_text_8, BITS_PER_CHAR}; + +use crate::bitpacking::{BITS_PER_CHAR, bitpack_text_8, bitpack_text_16, bitpack_text_32}; include!(concat!(env!("OUT_DIR"), "/bindings.rs")); pub mod bitpacking; @@ -26,26 +27,31 @@ pub fn sais64(text: &Vec, libsais_sparseness: usize) -> Result, &st // bitpacked values fit in uint8_t let packed_text = if libsais_sparseness == 1 { text } else { &bitpack_text_8(text, libsais_sparseness) }; sa = vec![0; packed_text.len()]; - exit_code = unsafe { libsais64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) }; + exit_code = + unsafe { libsais64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) }; } else if required_bits <= 16 { // bitpacked values fit in uint16_t let packed_text = bitpack_text_16(text, libsais_sparseness); sa = vec![0; packed_text.len()]; - exit_code = unsafe { libsais16x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) }; + exit_code = + unsafe { libsais16x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) }; } else { let packed_text = bitpack_text_32(text, libsais_sparseness); sa = vec![0; packed_text.len()]; let k = 1 << (libsais_sparseness * BITS_PER_CHAR); - exit_code = unsafe { libsais32x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, k, 0, null_mut()) }; + exit_code = + unsafe { libsais32x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, k, 0, null_mut()) }; } - + if exit_code == 0 { for elem in sa.iter_mut() { let libsais_sparseness = libsais_sparseness as i64; *elem *= libsais_sparseness; } - Ok(sa) - } else { Err("Failed building suffix array") } + Ok(sa) + } else { + Err("Failed building suffix array") + } } #[cfg(test)] diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 1ee7199..50bfe06 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -1,4 +1,5 @@ use std::error::Error; + use clap::{Parser, ValueEnum}; /// Build a (sparse, compressed) suffix array from the given text @@ -55,7 +56,9 @@ pub fn build_ssa( // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { SAConstructionAlgorithm::LibSais => libsais64(text, sparseness_factor)?, - SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text).ok_or("Building suffix array failed")? + SAConstructionAlgorithm::LibDivSufSort => { + libdivsufsort_rs::divsufsort64(text).ok_or("Building suffix array failed")? + } }; // make the SA sparse and decrease the vector size if we have sampling (sampling_rate > 1) diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index 0c77c19..0fa10f1 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -5,7 +5,7 @@ use std::{ }; use clap::Parser; -use sa_builder::{build_ssa, Arguments}; +use sa_builder::{Arguments, build_ssa}; use sa_compression::dump_compressed_suffix_array; use sa_index::binary::dump_suffix_array; use sa_mappings::proteins::Proteins; diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs index e9952a2..8a107c0 100644 --- a/sa-compression/src/lib.rs +++ b/sa-compression/src/lib.rs @@ -3,7 +3,7 @@ use std::{ io::{BufRead, Write} }; -use bitarray::{data_to_writer, Binary, BitArray}; +use bitarray::{Binary, BitArray, data_to_writer}; use sa_index::SuffixArray; /// Writes the compressed suffix array to a writer. diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index dab8577..1b51dc3 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -4,9 +4,9 @@ use sa_mappings::proteins::{Protein, Proteins, SEPARATION_CHARACTER, TERMINATION use text_compression::ProteinTextSlice; use crate::{ + Nullable, SuffixArray, sa_searcher::BoundSearch::{Maximum, Minimum}, - suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex}, - Nullable, SuffixArray + suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex} }; /// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array @@ -495,9 +495,9 @@ mod tests { use text_compression::ProteinText; use crate::{ + SuffixArray, sa_searcher::{BoundSearchResult, SearchAllSuffixesResult, Searcher}, - suffix_to_protein_index::SparseSuffixToProtein, - SuffixArray + suffix_to_protein_index::SparseSuffixToProtein }; #[test] diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs index a6a4e93..6cbcd91 100644 --- a/sa-index/src/suffix_to_protein_index.rs +++ b/sa-index/src/suffix_to_protein_index.rs @@ -112,10 +112,10 @@ mod tests { use text_compression::ProteinText; use crate::{ + Nullable, suffix_to_protein_index::{ DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex, SuffixToProteinMappingStyle - }, - Nullable + } }; fn build_text() -> ProteinText { diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs index 1a1cedf..82e1aec 100644 --- a/sa-server/src/main.rs +++ b/sa-server/src/main.rs @@ -6,18 +6,18 @@ use std::{ }; use axum::{ + Json, Router, extract::{DefaultBodyLimit, State}, http::StatusCode, - routing::post, - Json, Router + routing::post }; use clap::Parser; use sa_compression::load_compressed_suffix_array; use sa_index::{ + SuffixArray, binary::load_suffix_array, - peptide_search::{search_all_peptides, SearchResult}, - sa_searcher::SparseSearcher, - SuffixArray + peptide_search::{SearchResult, search_all_peptides}, + sa_searcher::SparseSearcher }; use sa_mappings::proteins::Proteins; use serde::Deserialize; diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs index 338e234..4b79f03 100644 --- a/text-compression/src/lib.rs +++ b/text-compression/src/lib.rs @@ -4,7 +4,7 @@ use std::{ io::{BufRead, Write} }; -use bitarray::{data_to_writer, Binary, BitArray}; +use bitarray::{Binary, BitArray, data_to_writer}; /// Structure representing the proteins, stored in a bit array using 5 bits per amino acid. pub struct ProteinText { From b91a7597b0dfee0a65148a49556397cf87cef747 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Fri, 25 Oct 2024 09:26:39 +0200 Subject: [PATCH 19/26] allow sparseness 6 for libsais --- sa-builder/README.md | 2 +- sa-builder/src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sa-builder/README.md b/sa-builder/README.md index 5f51b07..870ddb6 100644 --- a/sa-builder/README.md +++ b/sa-builder/README.md @@ -18,7 +18,7 @@ Options: -o, --output Output location where to store the suffix array -s, --sparseness-factor - The sparseness_factor used on the suffix array (default value 1, which means every value in the SA is used) [default: 1] + The sparseness_factor used on the suffix array (default value 1, which means every value in the SA is used). Internally, a library call will be performed with a maximum sparseness of 5 (because of memory usage). If a higher sparsity is desired, the largest divisor smaller than or equal to 5 is used for the library call. Then, the SA is filtered to achieve the desired sparsity. [default: 1] -a, --construction-algorithm The algorithm used to construct the suffix array (default value LibSais) [default: lib-sais] [possible values: lib-div-suf-sort, lib-sais] -c, --compress-sa diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 50bfe06..92c4b09 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -70,7 +70,7 @@ pub fn build_ssa( } // Max sparseness for libsais because it creates a bucket for each element of the alphabet (2 ^ (sparseness * bits_per_char) buckets). -const MAX_SPARSENESS: usize = 5; +const MAX_SPARSENESS: usize = 6; fn libsais64(text: &Vec, sparseness_factor: u8) -> Result, &str> { let sparseness_factor = sparseness_factor as usize; From d436dfdda0dad2609cef72e3446cd89d7f00e683 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Fri, 25 Oct 2024 10:22:37 +0200 Subject: [PATCH 20/26] keep bitpacked text owned before use --- libsais64-rs/src/lib.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index 13b4c55..68a07cf 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -25,7 +25,14 @@ pub fn sais64(text: &Vec, libsais_sparseness: usize) -> Result, &st let required_bits = libsais_sparseness * BITS_PER_CHAR; if required_bits <= 8 { // bitpacked values fit in uint8_t - let packed_text = if libsais_sparseness == 1 { text } else { &bitpack_text_8(text, libsais_sparseness) }; + let packed_text_data; + let packed_text = if libsais_sparseness == 1 { + text + } else { + packed_text_data = bitpack_text_8(text, libsais_sparseness); + &packed_text_data + }; + sa = vec![0; packed_text.len()]; exit_code = unsafe { libsais64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) }; From 4fd936ffa01cc7a06bf9ba465d399eb76471569d Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Fri, 25 Oct 2024 10:33:09 +0200 Subject: [PATCH 21/26] set max sparseness to 5 --- sa-builder/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 92c4b09..50bfe06 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -70,7 +70,7 @@ pub fn build_ssa( } // Max sparseness for libsais because it creates a bucket for each element of the alphabet (2 ^ (sparseness * bits_per_char) buckets). -const MAX_SPARSENESS: usize = 6; +const MAX_SPARSENESS: usize = 5; fn libsais64(text: &Vec, sparseness_factor: u8) -> Result, &str> { let sparseness_factor = sparseness_factor as usize; From 0fc7431b5bfdc6f4fac9834dc05d4b5995b7cfc3 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Fri, 25 Oct 2024 16:34:29 +0200 Subject: [PATCH 22/26] take ownership of text when bitpacking, don't keep both texts in memory --- libsais64-rs/src/bitpacking.rs | 6 +++--- libsais64-rs/src/lib.rs | 10 ++++------ sa-builder/src/lib.rs | 8 ++++---- sa-builder/src/main.rs | 7 +++---- 4 files changed, 14 insertions(+), 17 deletions(-) diff --git a/libsais64-rs/src/bitpacking.rs b/libsais64-rs/src/bitpacking.rs index 661589c..ba601a6 100644 --- a/libsais64-rs/src/bitpacking.rs +++ b/libsais64-rs/src/bitpacking.rs @@ -11,7 +11,7 @@ fn get_rank(c: u8) -> u8 { pub const BITS_PER_CHAR: usize = 5; // Bitpack text in a vector of u8 elements. BITS_PER_CHAR * sparseness_factor <= 8. -pub fn bitpack_text_8(text: &[u8], sparseness_factor: usize) -> Vec { +pub fn bitpack_text_8(text: Vec, sparseness_factor: usize) -> Vec { assert!(BITS_PER_CHAR * sparseness_factor <= 8); let num_ints = (text.len() + (sparseness_factor - 1)) / sparseness_factor; @@ -43,7 +43,7 @@ pub fn bitpack_text_8(text: &[u8], sparseness_factor: usize) -> Vec { } // Bitpack text in a vector of u16 elements. BITS_PER_CHAR * sparseness_factor <= 16. -pub fn bitpack_text_16(text: &[u8], sparseness_factor: usize) -> Vec { +pub fn bitpack_text_16(text: Vec, sparseness_factor: usize) -> Vec { assert!(BITS_PER_CHAR * sparseness_factor <= 16); let num_ints = (text.len() + (sparseness_factor - 1)) / sparseness_factor; @@ -75,7 +75,7 @@ pub fn bitpack_text_16(text: &[u8], sparseness_factor: usize) -> Vec { } // Bitpack text in a vector of u32 elements. BITS_PER_CHAR * sparseness_factor <= 32. -pub fn bitpack_text_32(text: &[u8], sparseness_factor: usize) -> Vec { +pub fn bitpack_text_32(text: Vec, sparseness_factor: usize) -> Vec { assert!(BITS_PER_CHAR * sparseness_factor <= 32); let num_ints = (text.len() + (sparseness_factor - 1)) / sparseness_factor; diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index 68a07cf..a344c6c 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -18,19 +18,17 @@ pub mod bitpacking; /// /// Returns Some with the suffix array build over the text if construction succeeds /// Returns None if construction of the suffix array failed -pub fn sais64(text: &Vec, libsais_sparseness: usize) -> Result, &str> { +pub fn sais64(text: Vec, libsais_sparseness: usize) -> Result, &'static str> { let exit_code; let mut sa; let required_bits = libsais_sparseness * BITS_PER_CHAR; if required_bits <= 8 { // bitpacked values fit in uint8_t - let packed_text_data; let packed_text = if libsais_sparseness == 1 { text } else { - packed_text_data = bitpack_text_8(text, libsais_sparseness); - &packed_text_data + bitpack_text_8(text, libsais_sparseness) }; sa = vec![0; packed_text.len()]; @@ -68,8 +66,8 @@ mod tests { #[test] fn check_build_sa_with_libsais64() { let sparseness_factor = 4; - let mut text = "BANANA-BANANA$".as_bytes().to_vec(); - let sa = sais64(&mut text, sparseness_factor); + let text = "BANANA-BANANA$".as_bytes().to_vec(); + let sa = sais64(text, sparseness_factor); let correct_sa: Vec = vec![12, 8, 0, 4]; assert_eq!(sa, Ok(correct_sa)); } diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 50bfe06..47a4aef 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -46,18 +46,18 @@ pub enum SAConstructionAlgorithm { /// /// The errors that occurred during the building of the suffix array itself pub fn build_ssa( - text: &mut Vec, + mut text: Vec, construction_algorithm: &SAConstructionAlgorithm, sparseness_factor: u8 ) -> Result, Box> { // translate all L's to a I - translate_l_to_i(text); + translate_l_to_i(&mut text); // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { SAConstructionAlgorithm::LibSais => libsais64(text, sparseness_factor)?, SAConstructionAlgorithm::LibDivSufSort => { - libdivsufsort_rs::divsufsort64(text).ok_or("Building suffix array failed")? + libdivsufsort_rs::divsufsort64(&mut text).ok_or("Building suffix array failed")? } }; @@ -71,7 +71,7 @@ pub fn build_ssa( // Max sparseness for libsais because it creates a bucket for each element of the alphabet (2 ^ (sparseness * bits_per_char) buckets). const MAX_SPARSENESS: usize = 5; -fn libsais64(text: &Vec, sparseness_factor: u8) -> Result, &str> { +fn libsais64(text: Vec, sparseness_factor: u8) -> Result, &'static str> { let sparseness_factor = sparseness_factor as usize; // set libsais_sparseness to highest sparseness factor fitting in 32-bit value and sparseness factor divisible by libsais sparseness diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index 0fa10f1..5ec0483 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -21,8 +21,9 @@ fn main() { eprintln!(); eprintln!("📋 Started loading the proteins..."); let start_proteins_time = get_time_ms().unwrap(); - let mut data = Proteins::try_from_database_file_uncompressed(&database_file) + let data = Proteins::try_from_database_file_uncompressed(&database_file) .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); + let bits_per_value = (data.len() as f64).log2().ceil() as usize; eprintln!( "✅ Successfully loaded the proteins in {} seconds!", (get_time_ms().unwrap() - start_proteins_time) / 1000.0 @@ -31,7 +32,7 @@ fn main() { eprintln!(); eprintln!("📋 Started building the suffix array..."); let start_ssa_time = get_time_ms().unwrap(); - let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor) + let sa = build_ssa(data, &construction_algorithm, sparseness_factor) .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); eprintln!( "✅ Successfully built the suffix array in {} seconds!", @@ -48,8 +49,6 @@ fn main() { let start_dump_time = get_time_ms().unwrap(); if compress_sa { - let bits_per_value = (data.len() as f64).log2().ceil() as usize; - if let Err(err) = dump_compressed_suffix_array(sa, sparseness_factor, bits_per_value, &mut file) { eprint_and_exit(err.to_string().as_str()); }; From bfd906982b4168eac3a57bc6714fa4579bb2c999 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Fri, 25 Oct 2024 16:48:59 +0200 Subject: [PATCH 23/26] adapt tests to pass text not as reference --- sa-builder/src/lib.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 47a4aef..5cc8140 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -175,42 +175,42 @@ mod tests { #[test] fn test_build_ssa_libsais() { let mut text = b"ABRACADABRA$".to_vec(); - let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 1).unwrap(); + let sa = build_ssa(text, &SAConstructionAlgorithm::LibSais, 1).unwrap(); assert_eq!(sa, vec![11, 10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]); } #[test] fn test_build_ssa_libsais_empty() { let mut text = b"".to_vec(); - let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 1).unwrap(); + let sa = build_ssa(text, &SAConstructionAlgorithm::LibSais, 1).unwrap(); assert_eq!(sa, vec![]); } #[test] fn test_build_ssa_libsais_sparse() { let mut text = b"ABRACADABRA$".to_vec(); - let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 2).unwrap(); + let sa = build_ssa(text, &SAConstructionAlgorithm::LibSais, 2).unwrap(); assert_eq!(sa, vec![10, 0, 8, 4, 6, 2]); } #[test] fn test_build_ssa_libdivsufsort() { let mut text = b"ABRACADABRA$".to_vec(); - let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap(); + let sa = build_ssa(text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap(); assert_eq!(sa, vec![11, 10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]); } #[test] fn test_build_ssa_libdivsufsort_empty() { let mut text = b"".to_vec(); - let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap(); + let sa = build_ssa(text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap(); assert_eq!(sa, vec![]); } #[test] fn test_build_ssa_libdivsufsort_sparse() { let mut text = b"ABRACADABRA$".to_vec(); - let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 2).unwrap(); + let sa = build_ssa(text, &SAConstructionAlgorithm::LibDivSufSort, 2).unwrap(); assert_eq!(sa, vec![10, 0, 8, 4, 6, 2]); } From 1ff2e0fe2b6ad6abb451921c306a56ab02a164c5 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Mon, 28 Oct 2024 09:04:30 +0100 Subject: [PATCH 24/26] cargo fmt --- libsais64-rs/src/lib.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index a344c6c..8c9011c 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -25,11 +25,7 @@ pub fn sais64(text: Vec, libsais_sparseness: usize) -> Result, &'st let required_bits = libsais_sparseness * BITS_PER_CHAR; if required_bits <= 8 { // bitpacked values fit in uint8_t - let packed_text = if libsais_sparseness == 1 { - text - } else { - bitpack_text_8(text, libsais_sparseness) - }; + let packed_text = if libsais_sparseness == 1 { text } else { bitpack_text_8(text, libsais_sparseness) }; sa = vec![0; packed_text.len()]; exit_code = From 0879f4ec0dcdf898f4b7dbb39a802848f3116d3b Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Mon, 28 Oct 2024 09:40:25 +0100 Subject: [PATCH 25/26] cargo clippy --- sa-builder/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 5cc8140..6294a28 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -57,7 +57,7 @@ pub fn build_ssa( let mut sa = match construction_algorithm { SAConstructionAlgorithm::LibSais => libsais64(text, sparseness_factor)?, SAConstructionAlgorithm::LibDivSufSort => { - libdivsufsort_rs::divsufsort64(&mut text).ok_or("Building suffix array failed")? + libdivsufsort_rs::divsufsort64(&text).ok_or("Building suffix array failed")? } }; From 1cd0465bea8a2c6af74fe39cffe32d8142e0ef66 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver <99675815+SimonVandeVyver@users.noreply.github.com> Date: Mon, 28 Oct 2024 11:37:08 +0100 Subject: [PATCH 26/26] change link to unipept-libsais to https --- libsais64-rs/builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libsais64-rs/builder.rs b/libsais64-rs/builder.rs index fe8430e..3d52f03 100644 --- a/libsais64-rs/builder.rs +++ b/libsais64-rs/builder.rs @@ -53,7 +53,7 @@ fn main() -> Result<(), Box> { // clone the c library Command::new("git") - .args(["clone", "git@github.com:unipept/unipept-libsais.git", "libsais", "--depth=1"]) + .args(["clone", "https://github.com/unipept/unipept-libsais.git", "libsais", "--depth=1"]) .status() .expect("Failed to clone the libsais repository");