From 0ecb79eb825b45d95deaf9711ba8fed155df7cf4 Mon Sep 17 00:00:00 2001 From: Simon Van de Vyver Date: Tue, 22 Oct 2024 13:36:01 +0200 Subject: [PATCH] support for sparseness factor up to 6 --- libsais64-rs/libsais-wrapper.h | 4 +- .../src/bitpacking.rs | 36 +++++++++++++++- libsais64-rs/src/lib.rs | 43 ++++++++++++++++--- sa-builder/src/lib.rs | 15 ++----- 4 files changed, 77 insertions(+), 21 deletions(-) rename {sa-builder => libsais64-rs}/src/bitpacking.rs (50%) diff --git a/libsais64-rs/libsais-wrapper.h b/libsais64-rs/libsais-wrapper.h index 6b4532b..eb2cd8d 100644 --- a/libsais64-rs/libsais-wrapper.h +++ b/libsais64-rs/libsais-wrapper.h @@ -1,4 +1,6 @@ #include "libsais/include/libsais16x64.h" -int64_t libsais16x64(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); \ No newline at end of file +int64_t libsais16x64(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); + +int64_t libsais32x64(const uint32_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs, int64_t * freq); \ No newline at end of file diff --git a/sa-builder/src/bitpacking.rs b/libsais64-rs/src/bitpacking.rs similarity index 50% rename from sa-builder/src/bitpacking.rs rename to libsais64-rs/src/bitpacking.rs index 486ffd4..df1aeed 100644 --- a/sa-builder/src/bitpacking.rs +++ b/libsais64-rs/src/bitpacking.rs @@ -10,8 +10,8 @@ fn get_rank(c: u8) -> u8 { } pub const BITS_PER_CHAR: usize = 5; -pub fn bitpack_text(text: &Vec, sparseness_factor: u8) -> Vec { - let sparseness_factor = sparseness_factor as usize; +pub fn bitpack_text_16(text: &Vec, sparseness_factor: usize) -> Vec { + let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; let mut text_packed = vec![0; num_ints]; @@ -40,4 +40,36 @@ pub fn bitpack_text(text: &Vec, sparseness_factor: u8) -> Vec { text_packed +} + +pub fn bitpack_text_32(text: &Vec, sparseness_factor: usize) -> Vec { + + let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor; + let mut text_packed = vec![0; num_ints]; + + if text.len() == 0 { + return text_packed; + } + + for i in 0..(num_ints-1) { + let ti = i * sparseness_factor; + let mut element = 0u32; + for j in 0..sparseness_factor { + let rank_c = get_rank(text[ti + j]) as u32; + element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)); + } + text_packed[i] = element; + } + + // Handle the last element + let mut last_element = 0u32; + let last_el_start = sparseness_factor * (num_ints - 1); + for i in 0..((text.len() - 1) % sparseness_factor + 1) { + let rank_c = get_rank(text[last_el_start + i]) as u32; + last_element |= rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - i)); + } + text_packed[num_ints - 1] = last_element; + + text_packed + } \ No newline at end of file diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index abb5e15..470b56d 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -3,8 +3,11 @@ #![allow(non_camel_case_types)] #![allow(non_snake_case)] use std::ptr::null_mut; +use crate::bitpacking::{bitpack_text_16, bitpack_text_32, BITS_PER_CHAR}; include!(concat!(env!("OUT_DIR"), "/bindings.rs")); +pub mod bitpacking; + /// Builds the suffix array over the `text` using the libsais64 algorithm /// /// # Arguments @@ -14,16 +17,42 @@ include!(concat!(env!("OUT_DIR"), "/bindings.rs")); /// /// Returns Some with the suffix array build over the text if construction succeeds /// Returns None if construction of the suffix array failed -pub fn sais64(text: &Vec, sparseness_factor: u8) -> Option> { - let mut sa = vec![0; text.len()]; - let exit_code = unsafe { libsais16x64(text.as_ptr(), sa.as_mut_ptr(), text.len() as i64, 0, null_mut()) }; +pub fn sais64(text: &Vec, sparseness_factor: u8) -> Result, &str> { + let sparseness_factor = sparseness_factor as usize; + let mut libsais_sparseness = sparseness_factor; + let mut sa; + let exit_code; + + if sparseness_factor * BITS_PER_CHAR <= 16 { + // bitpacked values fit in uint16_t + let packed_text = bitpack_text_16(text, libsais_sparseness); + sa = vec![0; packed_text.len()]; + exit_code = unsafe { libsais16x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) }; + } else { + // bitpacked values do not fit in uint16_t, use 32-bit text + // set libsais_sparseness to highest sparseness factor fitting in 32-bit value and sparseness factor divisible by libsais sparseness + libsais_sparseness = 32 / BITS_PER_CHAR; + while sparseness_factor % libsais_sparseness != 0 && libsais_sparseness * BITS_PER_CHAR > 16 { + libsais_sparseness -= 1; + } + + if sparseness_factor % libsais_sparseness != 0 { + return Err("invalid sparseness factor"); + } + + let packed_text = bitpack_text_32(text, libsais_sparseness); + sa = vec![0; packed_text.len()]; + let k = 1 << (libsais_sparseness * BITS_PER_CHAR); + exit_code = unsafe { libsais32x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, k, 0, null_mut()) }; + } + if exit_code == 0 { - let sparseness_factor = sparseness_factor as i64; for elem in sa.iter_mut() { - *elem *= sparseness_factor; + let libsais_sparseness = libsais_sparseness as i64; + *elem *= libsais_sparseness; } - Some(sa) - } else { None } + Ok(sa) + } else { Err("Failed building suffix array") } } #[cfg(test)] diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 14b6851..529e1be 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -1,9 +1,6 @@ use std::error::Error; -use crate::bitpacking::bitpack_text; use clap::{Parser, ValueEnum}; -pub mod bitpacking; - /// Build a (sparse, compressed) suffix array from the given text #[derive(Parser, Debug)] pub struct Arguments { @@ -57,14 +54,10 @@ pub fn build_ssa( // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { - SAConstructionAlgorithm::LibSais => { - let packed_text = bitpack_text(text, sparseness_factor); - - libsais64_rs::sais64(&packed_text, sparseness_factor) - }, - SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text) - } - .ok_or("Building suffix array failed")?; + SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(&text, sparseness_factor)?, + SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text).ok_or("Building suffix array failed")? + } + ; // make the SA sparse and decrease the vector size if we have sampling (sampling_rate > 1) if *construction_algorithm == SAConstructionAlgorithm::LibDivSufSort {