Skip to content

Commit

Permalink
support for sparseness factor up to 6
Browse files Browse the repository at this point in the history
  • Loading branch information
SimonVandeVyver committed Oct 22, 2024
1 parent 0424cd3 commit 0ecb79e
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 21 deletions.
4 changes: 3 additions & 1 deletion libsais64-rs/libsais-wrapper.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include "libsais/include/libsais16x64.h"


int64_t libsais16x64(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq);
int64_t libsais16x64(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq);

int64_t libsais32x64(const uint32_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs, int64_t * freq);
36 changes: 34 additions & 2 deletions sa-builder/src/bitpacking.rs → libsais64-rs/src/bitpacking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ fn get_rank(c: u8) -> u8 {
}

pub const BITS_PER_CHAR: usize = 5;
pub fn bitpack_text(text: &Vec<u8>, sparseness_factor: u8) -> Vec<u16> {
let sparseness_factor = sparseness_factor as usize;
pub fn bitpack_text_16(text: &Vec<u8>, sparseness_factor: usize) -> Vec<u16> {

let num_ints = (text.len() + (sparseness_factor-1)) / sparseness_factor;
let mut text_packed = vec![0; num_ints];

Expand Down Expand Up @@ -40,4 +40,36 @@ pub fn bitpack_text(text: &Vec<u8>, sparseness_factor: u8) -> Vec<u16> {

text_packed

}

/// Packs `text` into 32-bit integers, `sparseness_factor` characters per integer.
///
/// Every character is mapped through `get_rank` and stored in `BITS_PER_CHAR` bits.
/// Within one packed integer, the first character of a group is shifted the furthest
/// to the left and the last character of a full group ends up in the lowest bits.
/// When `text.len()` is not a multiple of `sparseness_factor`, the final group is
/// shorter and its missing trailing positions are left as zero bits.
///
/// NOTE(review): this assumes `get_rank` only returns values that fit in
/// `BITS_PER_CHAR` bits; larger ranks would bleed into the neighbouring character.
///
/// # Arguments
/// * `text` - The bytes to pack
/// * `sparseness_factor` - The number of characters stored in each packed integer
///
/// # Returns
///
/// A vector holding `ceil(text.len() / sparseness_factor)` packed integers; empty when
/// `text` is empty.
///
/// # Panics
///
/// Panics if `sparseness_factor` is zero, or if `sparseness_factor` characters of
/// `BITS_PER_CHAR` bits each do not fit in a `u32`.
pub fn bitpack_text_32(text: &[u8], sparseness_factor: usize) -> Vec<u32> {
    // Written as a division so the check itself cannot overflow for huge factors.
    let max_chars_per_int = u32::BITS as usize / BITS_PER_CHAR;
    assert!(
        (1..=max_chars_per_int).contains(&sparseness_factor),
        "sparseness factor must be non-zero and fit in a 32-bit packed value"
    );

    // `chunks` yields full groups followed by one shorter trailing group (if any), so
    // the last element needs no special handling; an empty text yields no groups.
    text.chunks(sparseness_factor)
        .map(|group| {
            group.iter().enumerate().fold(0u32, |packed, (j, &c)| {
                let rank_c = u32::from(get_rank(c));
                // Position 0 of the group lands in the highest used bits.
                packed | (rank_c << (BITS_PER_CHAR * (sparseness_factor - 1 - j)))
            })
        })
        .collect()
}
43 changes: 36 additions & 7 deletions libsais64-rs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
use std::ptr::null_mut;
use crate::bitpacking::{bitpack_text_16, bitpack_text_32, BITS_PER_CHAR};
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));

pub mod bitpacking;

/// Builds the suffix array over the `text` using the libsais64 algorithm
///
/// # Arguments
Expand All @@ -14,16 +17,42 @@ include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
///
/// Returns Some with the suffix array built over the text if construction succeeds
/// Returns None if construction of the suffix array failed
pub fn sais64(text: &Vec<u16>, sparseness_factor: u8) -> Option<Vec<i64>> {
let mut sa = vec![0; text.len()];
let exit_code = unsafe { libsais16x64(text.as_ptr(), sa.as_mut_ptr(), text.len() as i64, 0, null_mut()) };
pub fn sais64(text: &Vec<u8>, sparseness_factor: u8) -> Result<Vec<i64>, &str> {
let sparseness_factor = sparseness_factor as usize;
let mut libsais_sparseness = sparseness_factor;
let mut sa;
let exit_code;

if sparseness_factor * BITS_PER_CHAR <= 16 {
// bitpacked values fit in uint16_t
let packed_text = bitpack_text_16(text, libsais_sparseness);
sa = vec![0; packed_text.len()];
exit_code = unsafe { libsais16x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) };
} else {
// bitpacked values do not fit in uint16_t, use 32-bit text
// set libsais_sparseness to highest sparseness factor fitting in 32-bit value and sparseness factor divisible by libsais sparseness
libsais_sparseness = 32 / BITS_PER_CHAR;
while sparseness_factor % libsais_sparseness != 0 && libsais_sparseness * BITS_PER_CHAR > 16 {
libsais_sparseness -= 1;
}

if sparseness_factor % libsais_sparseness != 0 {
return Err("invalid sparseness factor");
}

let packed_text = bitpack_text_32(text, libsais_sparseness);
sa = vec![0; packed_text.len()];
let k = 1 << (libsais_sparseness * BITS_PER_CHAR);
exit_code = unsafe { libsais32x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, k, 0, null_mut()) };
}

if exit_code == 0 {
let sparseness_factor = sparseness_factor as i64;
for elem in sa.iter_mut() {
*elem *= sparseness_factor;
let libsais_sparseness = libsais_sparseness as i64;
*elem *= libsais_sparseness;
}
Some(sa)
} else { None }
Ok(sa)
} else { Err("Failed building suffix array") }
}

#[cfg(test)]
Expand Down
15 changes: 4 additions & 11 deletions sa-builder/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
use std::error::Error;
use crate::bitpacking::bitpack_text;
use clap::{Parser, ValueEnum};

pub mod bitpacking;

/// Build a (sparse, compressed) suffix array from the given text
#[derive(Parser, Debug)]
pub struct Arguments {
Expand Down Expand Up @@ -57,14 +54,10 @@ pub fn build_ssa(

// Build the suffix array using the selected algorithm
let mut sa = match construction_algorithm {
SAConstructionAlgorithm::LibSais => {
let packed_text = bitpack_text(text, sparseness_factor);

libsais64_rs::sais64(&packed_text, sparseness_factor)
},
SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text)
}
.ok_or("Building suffix array failed")?;
SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(&text, sparseness_factor)?,
SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text).ok_or("Building suffix array failed")?
}
;

// make the SA sparse and decrease the vector size if we have sampling (sampling_rate > 1)
if *construction_algorithm == SAConstructionAlgorithm::LibDivSufSort {
Expand Down

0 comments on commit 0ecb79e

Please sign in to comment.