Skip to content

Commit

Permalink
more debug info + some movement of SA code
Browse files Browse the repository at this point in the history
  • Loading branch information
tibvdm committed May 28, 2024
1 parent 285f246 commit 859a66b
Show file tree
Hide file tree
Showing 15 changed files with 194 additions and 98 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
target/
data/

.DS_Store
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 15 additions & 0 deletions bitarray/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,15 @@ impl BitArray {
self.data[end_block] |= value << (64 - end_block_offset);
}

/// Returns the number of bits in a single value of this `BitArray`.
///
/// This is a plain accessor: it reads the `bits_per_value` field and does not modify the array.
///
/// # Returns
///
/// The number of bits in a single value.
pub fn bits_per_value(&self) -> usize {
self.bits_per_value
}

/// Returns the length of the `BitArray`.
///
/// # Returns
Expand Down Expand Up @@ -266,6 +275,12 @@ mod tests {
assert_eq!(bitarray.data, vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000]);
}

#[test]
fn test_bitarray_bits_per_value() {
    // The width passed to the constructor must be reported back by the accessor.
    let reported = BitArray::with_capacity(4, 40).bits_per_value();

    assert_eq!(reported, 40);
}

#[test]
fn test_bitarray_len() {
let bitarray = BitArray::with_capacity(4, 40);
Expand Down
4 changes: 2 additions & 2 deletions fa-compression/src/algorithm1/decode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ static PREFIXES: [&str; 3] = ["EC:", "GO:", "IPR:IPR"];
/// ```
/// use fa_compression::algorithm1::decode;
///
/// let input = &[ 44, 44, 44, 189, 17, 26, 56, 173, 18, 116, 117, 225, 67, 116, 110, 17, 153, 39 ];
/// let input = &[ 44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117 ];
/// let result = decode(input);
/// assert_eq!(result, "EC:1.1.1.-;GO:0009279;IPR:IPR016364;IPR:IPR032635;IPR:IPR008816");
/// assert_eq!(result, "EC:1.1.1.-;GO:0009279;IPR:IPR016364");
/// ```
pub fn decode(input: &[u8]) -> String {
if input.is_empty() {
Expand Down
2 changes: 1 addition & 1 deletion fa-compression/src/algorithm1/encode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ use super::{
/// let input = "IPR:IPR016364;EC:1.1.1.-;GO:0009279";
/// let encoded = encode(input);
///
/// assert_eq!(encoded, vec![ 44, 44, 44, 189, 17, 26, 56, 173, 18, 116, 117 ]);
/// assert_eq!(encoded, vec![ 44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117 ]);
/// ```
pub fn encode(input: &str) -> Vec<u8> {
if input.is_empty() {
Expand Down
6 changes: 4 additions & 2 deletions fa-compression/src/algorithm1/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,10 @@ impl BitOr for CharacterSet {
mod tests {
use super::*;

static CHARACTERS: [u8; 16] =
[b'$', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'-', b'.', b'n', b',', b';'];
static CHARACTERS: [u8; 16] = [
b'$', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'-', b'.', b'n', b',',
b';'
];

static CHARACTER_SETS: [CharacterSet; 16] = [
CharacterSet::Empty,
Expand Down
34 changes: 25 additions & 9 deletions sa-builder/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,38 +31,54 @@ fn main() {
compress_sa
} = Arguments::parse();

eprintln!();
eprintln!("📋 Started loading the taxon file...");
let taxon_id_calculator =
TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!("✅ Successfully loaded the taxon file!");
eprintln!("\tAggregation method: LCA*");

// read input
eprintln!();
eprintln!("📋 Started loading the proteins...");
let mut data =
Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!("✅ Successfully loaded the proteins!");

// calculate sparse suffix array
eprintln!();
eprintln!("📋 Started building the suffix array...");
let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));

eprintln!("Suffix array constructed successfully.");
eprintln!("sa length: {}", sa.len());
eprintln!("✅ Successfully built the suffix array!");
eprintln!("\tAmount of items: {}", sa.len());
eprintln!("\tSample rate: {}", sparseness_factor);

// open the output file
let mut file =
open_file(&output).unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));

eprintln!();
eprintln!("📋 Started dumping the suffix array...");

if compress_sa {
let bits_per_value = (data.len() as f64).log2().ceil() as usize;

eprintln!("Compressing suffix array with {} bits per value.", bits_per_value);

if let Err(err) =
dump_compressed_suffix_array(sa, sparseness_factor, bits_per_value, &mut file)
{
eprint_and_exit(err.to_string().as_str());
};
} else if let Err(err) = dump_suffix_array(&sa, sparseness_factor, &mut file) {
eprint_and_exit(err.to_string().as_str());

eprintln!("✅ Successfully dumped the suffix array!");
eprintln!("\tAmount of bits per item: {}", bits_per_value);
} else {
if let Err(err) = dump_suffix_array(&sa, sparseness_factor, &mut file) {
eprint_and_exit(err.to_string().as_str());
}

eprintln!("✅ Successfully dumped the suffix array!");
eprintln!("\tAmount of bits per item: 64");
}
}

Expand Down
1 change: 1 addition & 0 deletions sa-compression/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ edition = "2021"

[dependencies]
bitarray = { path = "../bitarray" }
sa-index = { path = "../sa-index" }
15 changes: 8 additions & 7 deletions sa-compression/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use bitarray::{
Binary,
BitArray
};
use sa_index::SuffixArray;

/// Writes the compressed suffix array to a writer.
///
Expand Down Expand Up @@ -66,7 +67,7 @@ pub fn dump_compressed_suffix_array(
pub fn load_compressed_suffix_array(
reader: &mut impl BufRead,
bits_per_value: usize
) -> Result<(u8, BitArray), Box<dyn Error>> {
) -> Result<SuffixArray, Box<dyn Error>> {
// Read the sample rate from the binary file (1 byte)
let mut sample_rate_buffer = [0_u8; 1];
reader
Expand All @@ -87,7 +88,7 @@ pub fn load_compressed_suffix_array(
.read_binary(reader)
.map_err(|_| "Could not read the compressed suffix array from the binary file")?;

Ok((sample_rate, compressed_suffix_array))
Ok(SuffixArray::Compressed(compressed_suffix_array, sample_rate))
}

#[cfg(test)]
Expand Down Expand Up @@ -209,12 +210,11 @@ mod tests {
];

let mut reader = std::io::BufReader::new(&data[..]);
let (sample_rate, compressed_suffix_array) =
load_compressed_suffix_array(&mut reader, 8).unwrap();
let compressed_suffix_array = load_compressed_suffix_array(&mut reader, 8).unwrap();

assert_eq!(sample_rate, 1);
assert_eq!(compressed_suffix_array.sample_rate(), 1);
for i in 0 .. 10 {
assert_eq!(compressed_suffix_array.get(i), i as u64 + 1);
assert_eq!(compressed_suffix_array.get(i), i as i64 + 1);
}
}

Expand Down Expand Up @@ -262,7 +262,8 @@ mod tests {
let mut reader = FailingReader {
valid_read_count: 0
};
assert_eq!(reader.fill_buf().unwrap(), &[]);
let right_buffer: [u8; 0] = [];
assert_eq!(reader.fill_buf().unwrap(), &right_buffer);
assert_eq!(reader.consume(0), ());
let mut buffer = [0_u8; 1];
assert!(reader.read(&mut buffer).is_err());
Expand Down
14 changes: 9 additions & 5 deletions sa-index/src/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ use std::{
}
};

use crate::SuffixArray;

/// The `Binary` trait provides methods for reading and writing a struct as binary.
pub trait Binary {
/// Writes the struct as binary to the given writer.
Expand Down Expand Up @@ -132,7 +134,7 @@ pub fn dump_suffix_array(
/// # Errors
///
/// Returns any error from opening the file or reading the file
pub fn load_suffix_array(reader: &mut impl BufRead) -> Result<(u8, Vec<i64>), Box<dyn Error>> {
pub fn load_suffix_array(reader: &mut impl BufRead) -> Result<SuffixArray, Box<dyn Error>> {
// Read the sample rate from the binary file (1 byte)
let mut sample_rate_buffer = [0_u8; 1];
reader
Expand All @@ -151,7 +153,7 @@ pub fn load_suffix_array(reader: &mut impl BufRead) -> Result<(u8, Vec<i64>), Bo
sa.read_binary(reader)
.map_err(|_| "Could not read the suffix array from the binary file")?;

Ok((sample_rate, sa))
Ok(SuffixArray::Original(sa, sample_rate))
}

/// Fills the buffer with data read from the input.
Expand Down Expand Up @@ -374,10 +376,12 @@ mod tests {
];

let mut reader = buffer.as_slice();
let (sample_rate, sa) = load_suffix_array(&mut reader).unwrap();
let sa = load_suffix_array(&mut reader).unwrap();

assert_eq!(sample_rate, 1);
assert_eq!(sa, vec![1, 2, 3, 4, 5]);
assert_eq!(sa.sample_rate(), 1);
for i in 0 .. 5 {
assert_eq!(sa.get(i), i as i64 + 1);
}
}

#[test]
Expand Down
78 changes: 66 additions & 12 deletions sa-index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ pub mod suffix_to_protein_index;
/// Represents a suffix array.
pub enum SuffixArray {
/// The original suffix array.
Original(Vec<i64>),
Original(Vec<i64>, u8),
/// The compressed suffix array.
Compressed(BitArray)
Compressed(BitArray, u8)
}

impl SuffixArray {
Expand All @@ -21,12 +21,36 @@ impl SuffixArray {
/// The length of the suffix array.
pub fn len(&self) -> usize {
match self {
SuffixArray::Original(sa) => sa.len(),
SuffixArray::Compressed(sa) => sa.len()
SuffixArray::Original(sa, _) => sa.len(),
SuffixArray::Compressed(sa, _) => sa.len()
}
}

/// Returns the suffix array at the given index.
/// Reports how many bits each entry of the suffix array occupies.
///
/// # Returns
///
/// `64` for the `Original` variant (its entries are `i64`), and the `bits_per_value()` of the
/// wrapped `BitArray` for the `Compressed` variant.
pub fn bits_per_value(&self) -> usize {
    match *self {
        SuffixArray::Compressed(ref bit_array, _) => bit_array.bits_per_value(),
        SuffixArray::Original(..) => 64
    }
}

/// Reports the sample rate stored alongside the suffix array.
///
/// # Returns
///
/// The `u8` sample rate carried as the second field of either variant.
pub fn sample_rate(&self) -> u8 {
    // Both variants keep the sample rate in the same position, so one arm covers them.
    match self {
        SuffixArray::Original(_, rate) | SuffixArray::Compressed(_, rate) => *rate
    }
}

/// Returns the suffix array value at the given index.
///
/// # Arguments
///
Expand All @@ -37,16 +61,16 @@ impl SuffixArray {
/// The suffix array value at the given index.
pub fn get(&self, index: usize) -> i64 {
match self {
SuffixArray::Original(sa) => sa[index],
SuffixArray::Compressed(sa) => sa.get(index) as i64
SuffixArray::Original(sa, _) => sa[index],
SuffixArray::Compressed(sa, _) => sa.get(index) as i64
}
}

/// Returns whether the suffix array is empty.
///
/// # Returns
///
/// True if the suffix array is empty, false otherwise.
/// Returns `true` if the suffix array is empty, `false` otherwise.
pub fn is_empty(&self) -> bool {
self.len() == 0
}
Expand Down Expand Up @@ -79,7 +103,7 @@ mod tests {

#[test]
fn test_suffix_array_original() {
let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5]);
let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1);
assert_eq!(sa.len(), 5);
assert_eq!(sa.get(0), 1);
assert_eq!(sa.get(1), 2);
Expand All @@ -97,7 +121,7 @@ mod tests {
bitarray.set(3, 4);
bitarray.set(4, 5);

let sa = SuffixArray::Compressed(bitarray);
let sa = SuffixArray::Compressed(bitarray, 1);
assert_eq!(sa.len(), 5);
assert_eq!(sa.get(0), 1);
assert_eq!(sa.get(1), 2);
Expand All @@ -106,13 +130,43 @@ mod tests {
assert_eq!(sa.get(4), 5);
}

#[test]
fn test_suffix_array_len() {
    // Original variant: the length is the number of entries in the vector.
    let original = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1);
    assert_eq!(original.len(), 5);

    // Compressed variant: the length is taken from the wrapped bit array.
    let compressed = SuffixArray::Compressed(BitArray::with_capacity(5, 40), 1);
    assert_eq!(compressed.len(), 5);
}

#[test]
fn test_suffix_array_bits_per_value() {
    // Original variant: entries are `i64`, so 64 bits each.
    let original = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1);
    assert_eq!(original.bits_per_value(), 64);

    // Compressed variant: the width is whatever the bit array was created with.
    let compressed = SuffixArray::Compressed(BitArray::with_capacity(5, 40), 1);
    assert_eq!(compressed.bits_per_value(), 40);
}

#[test]
fn test_suffix_array_sample_rate() {
    // The sample rate given at construction must be returned for the original variant...
    let original = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1);
    assert_eq!(original.sample_rate(), 1);

    // ...and for the compressed variant as well.
    let compressed = SuffixArray::Compressed(BitArray::with_capacity(5, 40), 1);
    assert_eq!(compressed.sample_rate(), 1);
}

#[test]
fn test_suffix_array_is_empty() {
let sa = SuffixArray::Original(vec![]);
let sa = SuffixArray::Original(vec![], 1);
assert_eq!(sa.is_empty(), true);

let bitarray = BitArray::with_capacity(0, 0);
let sa = SuffixArray::Compressed(bitarray);
let sa = SuffixArray::Compressed(bitarray, 1);
assert_eq!(sa.is_empty(), true);
}

Expand Down
2 changes: 1 addition & 1 deletion sa-index/src/peptide_search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ pub fn search_proteins_for_peptide<'a>(
let peptide = peptide.strip_suffix('\n').unwrap_or(peptide).to_uppercase();

// words that are shorter than the sample rate are not searchable
if peptide.len() < searcher.sparseness_factor as usize {
if peptide.len() < searcher.sa.sample_rate() as usize {
return None;
}

Expand Down
Loading

0 comments on commit 859a66b

Please sign in to comment.