From f1dc185bcff67648273874030ebaa9d63ca08a2e Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Tue, 22 Oct 2024 15:07:00 +0200 Subject: [PATCH] allow all sparseness factors --- libsais64-rs/src/lib.rs | 20 +++----------------- sa-builder/src/lib.rs | 26 +++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index e0ff871..2f95c93 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -17,30 +17,16 @@ pub mod bitpacking; /// /// Returns Some with the suffix array build over the text if construction succeeds /// Returns None if construction of the suffix array failed -pub fn sais64(text: &Vec, sparseness_factor: u8) -> Result, &str> { - let sparseness_factor = sparseness_factor as usize; - let mut libsais_sparseness = sparseness_factor; - let mut sa; +pub fn sais64(text: &Vec, libsais_sparseness: usize) -> Result, &str> { let exit_code; + let mut sa; - if sparseness_factor * BITS_PER_CHAR <= 16 { + if libsais_sparseness * BITS_PER_CHAR <= 16 { // bitpacked values fit in uint16_t let packed_text = bitpack_text_16(text, libsais_sparseness); sa = vec![0; packed_text.len()]; exit_code = unsafe { libsais16x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) }; } else { - // bitpacked values do not fit in uint16_t, use 32-bit text - // set libsais_sparseness to highest sparseness factor fitting in 32-bit value and sparseness factor divisible by libsais sparseness - // max 28 out of 32 bits used, because a bucket is created for every element of the alfabet 8 * 2^28). 
- libsais_sparseness = 28 / BITS_PER_CHAR; - while sparseness_factor % libsais_sparseness != 0 && libsais_sparseness * BITS_PER_CHAR > 16 { - libsais_sparseness -= 1; - } - - if libsais_sparseness * BITS_PER_CHAR <= 16 { - return Err("invalid sparseness factor"); - } - let packed_text = bitpack_text_32(text, libsais_sparseness); sa = vec![0; packed_text.len()]; let k = 1 << (libsais_sparseness * BITS_PER_CHAR); diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 2cd4afb..f1fa28e 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -54,7 +54,7 @@ pub fn build_ssa( // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { - SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(&text, sparseness_factor)?, + SAConstructionAlgorithm::LibSais => libsais64(&text, sparseness_factor)?, SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text).ok_or("Building suffix array failed")? }; @@ -66,6 +66,30 @@ pub fn build_ssa( Ok(sa) } +const MAX_SPARSENESS: usize = 5; +fn libsais64(text: &Vec, sparseness_factor: u8) -> Result, &str> { + let sparseness_factor = sparseness_factor as usize; + + // set libsais_sparseness to the highest sparseness factor that fits in a 32-bit value and evenly divides the requested sparseness factor + // max 28 out of 32 bits used, because a bucket is created for every element of the alphabet (8 * 2^28). 
+ let mut libsais_sparseness = MAX_SPARSENESS; + while sparseness_factor % libsais_sparseness != 0 { + libsais_sparseness -= 1; + } + let sample_rate = sparseness_factor / libsais_sparseness; + println!(" Sparseness factor: {}", sparseness_factor); + println!(" Libsais sparseness factor: {}", libsais_sparseness); + println!(" Sample rate: {}", sample_rate); + + let mut sa = libsais64_rs::sais64(&text, libsais_sparseness)?; + + if sample_rate > 1 { + sample_sa(&mut sa, sample_rate as u8); + } + + Ok(sa) +} + /// Translate all L's to I's in the given text /// /// # Arguments