Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SA construction #29

Draft
wants to merge 26 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
c94df8d
SA construction with bitpacking
SimonVandeVyver Oct 14, 2024
850b2ee
use global variable for bits_per_char
SimonVandeVyver Oct 14, 2024
c09b61a
fixed tests for libsais with bitpacking
SimonVandeVyver Oct 14, 2024
8d72316
remove debug code
SimonVandeVyver Oct 14, 2024
1b15fad
remove unused import
SimonVandeVyver Oct 14, 2024
9053d25
bugfix shift overflow
SimonVandeVyver Oct 14, 2024
0424cd3
use adapted libsais library
SimonVandeVyver Oct 18, 2024
0ecb79e
support for sparseness factor up to 6
SimonVandeVyver Oct 22, 2024
bba56bb
set max alfabet size to 2^28
SimonVandeVyver Oct 22, 2024
ae2e5fa
bugfix: check if libsais_sparseness big enough
SimonVandeVyver Oct 22, 2024
f1dc185
allow all sparseness factors
SimonVandeVyver Oct 22, 2024
13a6d69
minor changes
SimonVandeVyver Oct 23, 2024
64e5589
add support for 8-bit libsais text
SimonVandeVyver Oct 23, 2024
aa6ef9f
add comments + do not bitpack text sparseness is 1
SimonVandeVyver Oct 24, 2024
b9064e6
add comments
SimonVandeVyver Oct 24, 2024
944adc2
add comment
SimonVandeVyver Oct 24, 2024
6dccb19
run clippy
SimonVandeVyver Oct 24, 2024
d2feb52
cargo fmt
SimonVandeVyver Oct 24, 2024
b91a759
allow sparseness 6 for libsais
SimonVandeVyver Oct 25, 2024
d436dfd
keep bitpacked text owned before use
SimonVandeVyver Oct 25, 2024
4fd936f
set max sparseness to 5
SimonVandeVyver Oct 25, 2024
0fc7431
take ownership of text when bitpacking, don't keep both texts in memory
SimonVandeVyver Oct 25, 2024
bfd9069
adapt tests to pass text not as reference
SimonVandeVyver Oct 25, 2024
1ff2e0f
cargo fmt
SimonVandeVyver Oct 28, 2024
0879f4e
cargo clippy
SimonVandeVyver Oct 28, 2024
1cd0465
change link to unipept-libsais to https
SimonVandeVyver Oct 28, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ resolver = "2"
members = [ "bitarray",
"fa-compression",
"libsais64-rs",
"sa-builder", "sa-compression",
"sa-builder",
"sa-compression",
"sa-index",
"sa-mappings",
"sa-server"
Expand Down
2 changes: 1 addition & 1 deletion fa-compression/benches/algorithm2/decode.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use criterion::black_box;
use fa_compression::algorithm2::{decode, encode, CompressionTable};
use fa_compression::algorithm2::{CompressionTable, decode, encode};

use super::util::generate_annotation;

Expand Down
2 changes: 1 addition & 1 deletion fa-compression/benches/algorithm2/encode.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use criterion::black_box;
use fa_compression::algorithm2::{encode, CompressionTable};
use fa_compression::algorithm2::{CompressionTable, encode};

use super::util::generate_annotation;

Expand Down
2 changes: 1 addition & 1 deletion fa-compression/benches/util.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use rand::{rngs::ThreadRng, Rng};
use rand::{Rng, rngs::ThreadRng};

/// Generate a random InterPro annotation.
pub fn generate_ipr(random: &mut ThreadRng) -> String {
Expand Down
2 changes: 1 addition & 1 deletion libsais64-rs/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ fn main() -> Result<(), Box<dyn Error>> {

// clone the c library
Command::new("git")
.args(["clone", "https://github.com/IlyaGrebnov/libsais.git", "--depth=1"])
.args(["clone", "https://github.com/unipept/unipept-libsais.git", "libsais", "--depth=1"])
.status()
.expect("Failed to clone the libsais repository");

Expand Down
6 changes: 5 additions & 1 deletion libsais64-rs/libsais-wrapper.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#include "libsais/include/libsais64.h"
#include "libsais/include/libsais16x64.h"


int64_t libsais16x64(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq);

int64_t libsais32x64(const uint32_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs, int64_t * freq);

int64_t libsais64(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq);
107 changes: 107 additions & 0 deletions libsais64-rs/src/bitpacking.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/// Maps a character of the protein text to its rank in the bitpacked alphabet.
///
/// `$` maps to 0, `-` maps to 1 and every other byte `c` maps to `2 + (c - b'A')`,
/// so `A` becomes 2, `B` becomes 3, ... and `Z` becomes 27.
///
/// The input is expected to be `$`, `-` or an uppercase ASCII letter: only for those
/// inputs is the rank guaranteed to fit in `BITS_PER_CHAR` bits. Any other byte below
/// `b'A'` makes the subtraction underflow.
fn get_rank(c: u8) -> u8 {
    match c {
        b'$' => 0,
        b'-' => 1,
        _ => 2 + (c - b'A')
    }
}

/// Amount of bits necessary to represent one character in the protein text.
pub const BITS_PER_CHAR: usize = 5;

/// Packs every group of `sparseness_factor` consecutive characters of `text` into one integer.
///
/// Within a group, the first character ends up in the most significant used bits and each
/// following character `BITS_PER_CHAR` bits lower. A trailing group that is shorter than
/// `sparseness_factor` is packed the same way, which leaves its low bits zero.
///
/// The caller is responsible for checking that `BITS_PER_CHAR * sparseness_factor` fits in `T`.
///
/// # Panics
///
/// Panics if `sparseness_factor` is 0.
fn bitpack_text<T>(text: &[u8], sparseness_factor: usize) -> Vec<T>
where
    T: From<u8> + std::ops::Shl<usize, Output = T> + std::ops::BitOr<Output = T>
{
    assert!(sparseness_factor >= 1, "sparseness_factor must be at least 1");

    // `chunks` yields the (possibly shorter) last group as well, so no special casing is
    // needed for the tail, and an empty text simply yields an empty vector.
    text.chunks(sparseness_factor)
        .map(|chunk| {
            chunk.iter().enumerate().fold(T::from(0u8), |packed, (j, &c)| {
                let shift = BITS_PER_CHAR * (sparseness_factor - 1 - j);
                packed | (T::from(get_rank(c)) << shift)
            })
        })
        .collect()
}

/// Bitpacks `text` into a vector of `u8` elements, `sparseness_factor` characters per element.
///
/// The text is taken by value so it is dropped as soon as packing is done.
///
/// # Panics
///
/// Panics if `BITS_PER_CHAR * sparseness_factor > 8` or if `sparseness_factor` is 0.
pub fn bitpack_text_8(text: Vec<u8>, sparseness_factor: usize) -> Vec<u8> {
    assert!(BITS_PER_CHAR * sparseness_factor <= 8);
    bitpack_text(&text, sparseness_factor)
}

/// Bitpacks `text` into a vector of `u16` elements, `sparseness_factor` characters per element.
///
/// The text is taken by value so it is dropped as soon as packing is done.
///
/// # Panics
///
/// Panics if `BITS_PER_CHAR * sparseness_factor > 16` or if `sparseness_factor` is 0.
pub fn bitpack_text_16(text: Vec<u8>, sparseness_factor: usize) -> Vec<u16> {
    assert!(BITS_PER_CHAR * sparseness_factor <= 16);
    bitpack_text(&text, sparseness_factor)
}

/// Bitpacks `text` into a vector of `u32` elements, `sparseness_factor` characters per element.
///
/// The text is taken by value so it is dropped as soon as packing is done.
///
/// # Panics
///
/// Panics if `BITS_PER_CHAR * sparseness_factor > 32` or if `sparseness_factor` is 0.
pub fn bitpack_text_32(text: Vec<u8>, sparseness_factor: usize) -> Vec<u32> {
    assert!(BITS_PER_CHAR * sparseness_factor <= 32);
    bitpack_text(&text, sparseness_factor)
}
54 changes: 46 additions & 8 deletions libsais64-rs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
use std::ptr::null_mut;

use crate::bitpacking::{BITS_PER_CHAR, bitpack_text_8, bitpack_text_16, bitpack_text_32};
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));

/// Builds the suffix array over the `text` using the libsais64 algorithm
pub mod bitpacking;

/// Builds the suffix array over the `text` using the libsais algorithm
///
/// # Arguments
/// * `text` - The text used for suffix array construction
Expand All @@ -13,10 +18,41 @@ include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
///
/// Returns Ok with the suffix array built over the text if construction succeeds
/// Returns Err if construction of the suffix array failed
pub fn sais64(text: &[u8]) -> Option<Vec<i64>> {
let mut sa = vec![0; text.len()];
let exit_code = unsafe { libsais64(text.as_ptr(), sa.as_mut_ptr(), text.len() as i64, 0, std::ptr::null_mut()) };
if exit_code == 0 { Some(sa) } else { None }
/// Builds a (sparse) suffix array over `text` using libsais.
///
/// Every group of `libsais_sparseness` characters is bitpacked into one integer of 8, 16 or
/// 32 bits (the smallest width that fits `libsais_sparseness * BITS_PER_CHAR` bits) and the
/// suffix array is built over the packed text. The resulting positions are multiplied by
/// `libsais_sparseness` so they index into the original, unpacked text.
///
/// # Arguments
/// * `text` - The text used for suffix array construction; ownership is taken so the
///   unpacked text does not have to outlive the packing step
/// * `libsais_sparseness` - The number of characters packed into one libsais symbol
///
/// # Errors
///
/// Returns an error if `libsais_sparseness` is 0, if the packed characters need more than
/// 32 bits, or if libsais reports a non-zero exit code.
pub fn sais64(text: Vec<u8>, libsais_sparseness: usize) -> Result<Vec<i64>, &'static str> {
    // Reject arguments the bitpacking routines would otherwise panic on.
    if libsais_sparseness == 0 {
        return Err("Sparseness factor must be at least 1");
    }
    let required_bits = libsais_sparseness * BITS_PER_CHAR;
    if required_bits > 32 {
        return Err("Sparseness factor too large: bitpacked characters do not fit in 32 bits");
    }

    let (mut sa, exit_code) = if required_bits <= 8 {
        // bitpacked values fit in uint8_t; with sparseness 1 the raw text is used as is
        let packed_text = if libsais_sparseness == 1 { text } else { bitpack_text_8(text, libsais_sparseness) };
        let mut sa = vec![0; packed_text.len()];
        // SAFETY: `packed_text` and `sa` are live allocations holding exactly
        // `packed_text.len()` elements for the whole call; `fs` is 0 and `freq` is null.
        let exit_code =
            unsafe { libsais64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) };
        (sa, exit_code)
    } else if required_bits <= 16 {
        // bitpacked values fit in uint16_t
        let packed_text = bitpack_text_16(text, libsais_sparseness);
        let mut sa = vec![0; packed_text.len()];
        // SAFETY: `packed_text` and `sa` are live allocations holding exactly
        // `packed_text.len()` elements for the whole call; `fs` is 0 and `freq` is null.
        let exit_code =
            unsafe { libsais16x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, 0, null_mut()) };
        (sa, exit_code)
    } else {
        // bitpacked values fit in uint32_t; the alphabet size has to be passed explicitly
        let packed_text = bitpack_text_32(text, libsais_sparseness);
        let mut sa = vec![0; packed_text.len()];
        let k = 1 << required_bits;
        // SAFETY: `packed_text` and `sa` are live allocations holding exactly
        // `packed_text.len()` elements for the whole call; every packed symbol uses at most
        // `required_bits` bits and is therefore smaller than `k`; `fs` is 0 and `freq` is null.
        let exit_code =
            unsafe { libsais32x64(packed_text.as_ptr(), sa.as_mut_ptr(), packed_text.len() as i64, k, 0, null_mut()) };
        (sa, exit_code)
    };

    if exit_code != 0 {
        return Err("Failed building suffix array");
    }

    // Translate positions in the packed text back to positions in the original text.
    if libsais_sparseness > 1 {
        let factor = libsais_sparseness as i64;
        for elem in sa.iter_mut() {
            *elem *= factor;
        }
    }

    Ok(sa)
}

#[cfg(test)]
Expand All @@ -25,8 +61,10 @@ mod tests {

#[test]
fn check_build_sa_with_libsais64() {
let text = "banana$";
let sa = sais64(text.as_bytes());
assert_eq!(sa, Some(vec![6, 5, 3, 1, 0, 4, 2]));
let sparseness_factor = 4;
let text = "BANANA-BANANA$".as_bytes().to_vec();
let sa = sais64(text, sparseness_factor);
let correct_sa: Vec<i64> = vec![12, 8, 0, 4];
assert_eq!(sa, Ok(correct_sa));
}
}
2 changes: 1 addition & 1 deletion sa-builder/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Options:
-o, --output <OUTPUT>
Output location where to store the suffix array
-s, --sparseness-factor <SPARSENESS_FACTOR>
The sparseness_factor used on the suffix array (default value 1, which means every value in the SA is used) [default: 1]
The sparseness_factor used on the suffix array (default value 1, which means every value in the SA is used). Internally, a library call will be performed with a maximum sparseness of 5 (because of memory usage). If a higher sparsity is desired, the largest divisor smaller than or equal to 5 is used for the library call. Then, the SA is filtered to achieve the desired sparsity. [default: 1]
-a, --construction-algorithm <CONSTRUCTION_ALGORITHM>
The algorithm used to construct the suffix array (default value LibSais) [default: lib-sais] [possible values: lib-div-suf-sort, lib-sais]
-c, --compress-sa
Expand Down
54 changes: 41 additions & 13 deletions sa-builder/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,22 +46,50 @@
///
/// The errors that occurred during the building of the suffix array itself
pub fn build_ssa(
text: &mut Vec<u8>,
mut text: Vec<u8>,
construction_algorithm: &SAConstructionAlgorithm,
sparseness_factor: u8
) -> Result<Vec<i64>, Box<dyn Error>> {
// translate all L's to a I
translate_l_to_i(text);
translate_l_to_i(&mut text);

// Build the suffix array using the selected algorithm
let mut sa = match construction_algorithm {
SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(text),
SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text)
}
.ok_or("Building suffix array failed")?;
SAConstructionAlgorithm::LibSais => libsais64(text, sparseness_factor)?,
SAConstructionAlgorithm::LibDivSufSort => {
libdivsufsort_rs::divsufsort64(&text).ok_or("Building suffix array failed")?
}
};

// make the SA sparse and decrease the vector size if we have sampling (sampling_rate > 1)
sample_sa(&mut sa, sparseness_factor);
if *construction_algorithm == SAConstructionAlgorithm::LibDivSufSort {
sample_sa(&mut sa, sparseness_factor);
}

Ok(sa)
}

// Maximum sparseness passed to libsais. libsais creates a bucket for each element of the
// bitpacked alphabet (2 ^ (sparseness * bits_per_char) buckets), so memory usage grows
// exponentially with this value.
const MAX_SPARSENESS: usize = 5;

/// Builds a sparse suffix array over `text` with libsais.
///
/// libsais is called with the largest sparseness that is at most `MAX_SPARSENESS` and that
/// divides `sparseness_factor`. If that is smaller than the requested `sparseness_factor`,
/// the resulting suffix array is sampled afterwards to reach the requested sparseness.
///
/// # Errors
///
/// Returns an error if `sparseness_factor` is 0 or if the suffix array construction fails.
fn libsais64(text: Vec<u8>, sparseness_factor: u8) -> Result<Vec<i64>, &'static str> {
    let sparseness_factor = usize::from(sparseness_factor);

    // A sparseness of 0 is meaningless: every candidate "divides" it, which would silently
    // build an array with sparseness MAX_SPARSENESS and a sample rate of 0.
    if sparseness_factor == 0 {
        return Err("Sparseness factor must be at least 1");
    }

    // Largest divisor of the requested sparseness that libsais can handle; 1 always divides.
    let libsais_sparseness = (1..=MAX_SPARSENESS)
        .rev()
        .find(|candidate| sparseness_factor % candidate == 0)
        .unwrap_or(1);
    let sample_rate = sparseness_factor / libsais_sparseness;
    eprintln!("\tSparseness factor: {}", sparseness_factor);
    eprintln!("\tLibsais sparseness factor: {}", libsais_sparseness);
    eprintln!("\tSample rate: {}", sample_rate);

    let mut sa = libsais64_rs::sais64(text, libsais_sparseness)?;

    if sample_rate > 1 {
        // Cannot truncate: sample_rate <= sparseness_factor, which originated from a u8.
        sample_sa(&mut sa, sample_rate as u8);
    }

    Ok(sa)
}
Expand Down Expand Up @@ -146,43 +174,43 @@

#[test]
fn test_build_ssa_libsais() {
let mut text = b"ABRACADABRA$".to_vec();

Check warning on line 177 in sa-builder/src/lib.rs

View workflow job for this annotation

GitHub Actions / Check + test

variable does not need to be mutable
let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 1).unwrap();
let sa = build_ssa(text, &SAConstructionAlgorithm::LibSais, 1).unwrap();
assert_eq!(sa, vec![11, 10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]);
}

#[test]
fn test_build_ssa_libsais_empty() {
let mut text = b"".to_vec();

Check warning on line 184 in sa-builder/src/lib.rs

View workflow job for this annotation

GitHub Actions / Check + test

variable does not need to be mutable
let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 1).unwrap();
let sa = build_ssa(text, &SAConstructionAlgorithm::LibSais, 1).unwrap();
assert_eq!(sa, vec![]);
}

#[test]
fn test_build_ssa_libsais_sparse() {
let mut text = b"ABRACADABRA$".to_vec();

Check warning on line 191 in sa-builder/src/lib.rs

View workflow job for this annotation

GitHub Actions / Check + test

variable does not need to be mutable
let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 2).unwrap();
let sa = build_ssa(text, &SAConstructionAlgorithm::LibSais, 2).unwrap();
assert_eq!(sa, vec![10, 0, 8, 4, 6, 2]);
}

#[test]
fn test_build_ssa_libdivsufsort() {
let mut text = b"ABRACADABRA$".to_vec();

Check warning on line 198 in sa-builder/src/lib.rs

View workflow job for this annotation

GitHub Actions / Check + test

variable does not need to be mutable
let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap();
let sa = build_ssa(text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap();
assert_eq!(sa, vec![11, 10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]);
}

#[test]
fn test_build_ssa_libdivsufsort_empty() {
let mut text = b"".to_vec();

Check warning on line 205 in sa-builder/src/lib.rs

View workflow job for this annotation

GitHub Actions / Check + test

variable does not need to be mutable
let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap();
let sa = build_ssa(text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap();
assert_eq!(sa, vec![]);
}

#[test]
fn test_build_ssa_libdivsufsort_sparse() {
let mut text = b"ABRACADABRA$".to_vec();

Check warning on line 212 in sa-builder/src/lib.rs

View workflow job for this annotation

GitHub Actions / Check + test

variable does not need to be mutable
let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 2).unwrap();
let sa = build_ssa(text, &SAConstructionAlgorithm::LibDivSufSort, 2).unwrap();
assert_eq!(sa, vec![10, 0, 8, 4, 6, 2]);
}

Expand Down
10 changes: 4 additions & 6 deletions sa-builder/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::{
};

use clap::Parser;
use sa_builder::{build_ssa, Arguments};
use sa_builder::{Arguments, build_ssa};
use sa_compression::dump_compressed_suffix_array;
use sa_index::binary::dump_suffix_array;
use sa_mappings::proteins::Proteins;
Expand All @@ -21,8 +21,9 @@ fn main() {
eprintln!();
eprintln!("📋 Started loading the proteins...");
let start_proteins_time = get_time_ms().unwrap();
let mut data = Proteins::try_from_database_file_uncompressed(&database_file)
let data = Proteins::try_from_database_file_uncompressed(&database_file)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
let bits_per_value = (data.len() as f64).log2().ceil() as usize;
eprintln!(
"✅ Successfully loaded the proteins in {} seconds!",
(get_time_ms().unwrap() - start_proteins_time) / 1000.0
Expand All @@ -31,14 +32,13 @@ fn main() {
eprintln!();
eprintln!("📋 Started building the suffix array...");
let start_ssa_time = get_time_ms().unwrap();
let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor)
let sa = build_ssa(data, &construction_algorithm, sparseness_factor)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!(
"✅ Successfully built the suffix array in {} seconds!",
(get_time_ms().unwrap() - start_ssa_time) / 1000.0
);
eprintln!("\tAmount of items: {}", sa.len());
eprintln!("\tSample rate: {}", sparseness_factor);

// open the output file
let mut file =
Expand All @@ -49,8 +49,6 @@ fn main() {
let start_dump_time = get_time_ms().unwrap();

if compress_sa {
let bits_per_value = (data.len() as f64).log2().ceil() as usize;

if let Err(err) = dump_compressed_suffix_array(sa, sparseness_factor, bits_per_value, &mut file) {
eprint_and_exit(err.to_string().as_str());
};
Expand Down
Loading
Loading