From aa8f2b759ae0ea4b1637ba345f2274b604b0bc1d Mon Sep 17 00:00:00 2001 From: tibvdm Date: Thu, 16 May 2024 15:03:50 +0200 Subject: [PATCH 01/26] bitarray compression --- Cargo.lock | 4 ++ Cargo.toml | 2 +- bitarray/Cargo.toml | 8 +++ bitarray/src/binary.rs | 127 +++++++++++++++++++++++++++++++++++++++++ bitarray/src/lib.rs | 101 ++++++++++++++++++++++++++++++++ 5 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 bitarray/Cargo.toml create mode 100644 bitarray/src/binary.rs create mode 100644 bitarray/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 27c3eed..56db76a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -240,6 +240,10 @@ dependencies = [ "which", ] +[[package]] +name = "bitarray" +version = "0.1.0" + [[package]] name = "bitflags" version = "1.3.2" diff --git a/Cargo.toml b/Cargo.toml index 728b06e..617ad70 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [workspace] resolver = "2" -members = [ +members = [ "bitarray", "fa-compression", "libsais64-rs", "sa-builder", diff --git a/bitarray/Cargo.toml b/bitarray/Cargo.toml new file mode 100644 index 0000000..8176d57 --- /dev/null +++ b/bitarray/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "bitarray" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs new file mode 100644 index 0000000..546b65c --- /dev/null +++ b/bitarray/src/binary.rs @@ -0,0 +1,127 @@ +use std::io::{BufRead, Read, Result, Write}; + +use crate::BitArray; + +pub trait Binary { + fn write_binary(&self, writer: W) -> Result<()>; + fn read_binary(&mut self, reader: R) -> Result<()>; +} + +impl Binary for BitArray { + fn write_binary(&self, mut writer: W) -> Result<()> { + for value in self.data.iter() { + writer.write_all(&value.to_le_bytes())?; + } + + Ok(()) + } + + fn read_binary(&mut self, mut reader: R) -> Result<()> { + self.data.clear(); + + let mut buffer = vec![0; 8 * 1024]; + + loop { + let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer); + for buffer_slice in buffer[..bytes_read].chunks_exact(8) { + self.data.push(u64::from_le_bytes(buffer_slice.try_into().unwrap())); + } + + if finished { + break; + } + } + + Ok(()) + } +} + +fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { + // Store the buffer size in advance, because rust will complain + // about the buffer being borrowed mutably while it's borrowed + let buffer_size = buffer.len(); + + let mut writable_buffer_space = buffer.as_mut(); + + loop { + match input.read(writable_buffer_space) { + // No bytes written, which means we've completely filled the buffer + // or we've reached the end of the file + Ok(0) => { + return ( + !writable_buffer_space.is_empty(), + buffer_size - writable_buffer_space.len() + ); + } + + // We've read {bytes_read} bytes + Ok(bytes_read) => { + // Shrink the writable buffer slice + writable_buffer_space = writable_buffer_space[bytes_read..].as_mut(); + } + + Err(err) => { + panic!("Error while reading input: {}", err); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_fill_buffer() { + let input_str = "a".repeat(8_000); + let mut input = input_str.as_bytes(); + + let mut buffer = vec![0; 800]; + + loop { + let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer); + + if finished { + assert!(bytes_read < 800); + break; + } else { + assert_eq!(bytes_read, 800); + } + } + } + + #[test] + fn test_write_binary() { + let mut 
bitarray = BitArray::<40>::with_capacity(4); + bitarray.set(0, 0x1234567890); + bitarray.set(1, 0xabcdef0123); + bitarray.set(2, 0x4567890abc); + bitarray.set(3, 0xdef0123456); + + let mut buffer = Vec::new(); + bitarray.write_binary(&mut buffer).unwrap(); + + assert_eq!(buffer, vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, + 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ]); + } + + #[test] + fn test_read_binary() { + let buffer = vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, + 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ]; + + let mut bitarray = BitArray::<40>::with_capacity(4); + bitarray.read_binary(&buffer[..]).unwrap(); + + assert_eq!(bitarray.get(0), 0x1234567890); + assert_eq!(bitarray.get(1), 0xabcdef0123); + assert_eq!(bitarray.get(2), 0x4567890abc); + assert_eq!(bitarray.get(3), 0xdef0123456); + } +} diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs new file mode 100644 index 0000000..d8a1e9b --- /dev/null +++ b/bitarray/src/lib.rs @@ -0,0 +1,101 @@ +pub mod binary; + +pub struct BitArray { + pub data: Vec, + pub mask: u64, + pub len: usize, +} + +impl BitArray { + pub fn with_capacity(capacity: usize) -> Self { + Self { + data: vec![0; capacity * B / 64 + 1], + mask: (1 << B) - 1, + len: capacity, + } + } + + pub fn get(&self, index: usize) -> u64 { + let start_block = index * B / 64; + let start_block_offset = index * B % 64; + + if start_block_offset + B <= 64 { + return self.data[start_block] >> (64 - start_block_offset - B) & self.mask; + } + + let end_block = (index + 1) * B / 64; + let end_block_offset = (index + 1) * B % 64; + + let a = self.data[start_block] << end_block_offset; + let b = self.data[end_block] >> (64 - end_block_offset); + + (a | b) & self.mask + } + + pub fn set(&mut self, index: usize, value: u64) { + let start_block = index * B / 64; + let start_block_offset = index * B % 64; + + if start_block_offset + B <= 64 { + self.data[start_block] &= !(self.mask << (64 - start_block_offset - B)); + self.data[start_block] |= value << (64 - start_block_offset - B); + return; + } + + let end_block = (index + 1) * B / 64; + let end_block_offset = (index + 1) * B % 64; + + self.data[start_block] &= !(self.mask >> start_block_offset); + self.data[start_block] |= value >> end_block_offset; + + self.data[end_block] &= !(self.mask << (64 - end_block_offset)); + self.data[end_block] |= value << (64 - end_block_offset); + } + + pub fn len(&self) -> usize { + self.len + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bitarray_with_capacity() { + let bitarray = BitArray::<40>::with_capacity(4); + assert_eq!(bitarray.data, vec![ 0, 0, 0 ]); + assert_eq!(bitarray.mask, 0xff_ffff_ffff); + assert_eq!(bitarray.len, 4); + } + + #[test] + fn test_bitarray_get() { + let mut bitarray = BitArray::<40>::with_capacity(4); + bitarray.data = vec![ 0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144eb9ca32eb4a4 ]; + + assert_eq!(bitarray.get(0), 0b0001110011111010110001000111111100110010); + assert_eq!(bitarray.get(1), 0b1100001001010010011000010100110111001001); + assert_eq!(bitarray.get(2), 0b1111001101001101101101101011101001010001); + assert_eq!(bitarray.get(3), 0b0000100010010001010001001110101110011100); + } + + #[test] + fn test_bitarray_set() { + let mut bitarray = BitArray::<40>::with_capacity(4); + bitarray.data = vec![ 0, 0, 0 ]; + + bitarray.set(0, 0b0001110011111010110001000111111100110010); + bitarray.set(1, 
0b1100001001010010011000010100110111001001); + bitarray.set(2, 0b1111001101001101101101101011101001010001); + bitarray.set(3, 0b0000100010010001010001001110101110011100); + + assert_eq!(bitarray.data, vec![ 0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000 ]); + } + + #[test] + fn test_bitarray_len() { + let bitarray = BitArray::<40>::with_capacity(4); + assert_eq!(bitarray.len(), 4); + } +} \ No newline at end of file From 84edc18c4fe3b3412a0d4bc4d7e43f97f0170dc9 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Thu, 16 May 2024 15:25:46 +0200 Subject: [PATCH 02/26] add coverage for bitarray and small README --- .github/workflows/coverage.yml | 29 +++++++++++++++++++++++++++++ bitarray/README.md | 21 +++++++++++++++++++++ bitarray/src/lib.rs | 1 - codecov.yml | 12 ++++++++++++ 4 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 bitarray/README.md diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index bf98de4..88368eb 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -19,6 +19,33 @@ jobs: toolchain: nightly override: true + + + - name: Run cargo test (bitarray) + uses: actions-rs/cargo@v1 + with: + command: test + args: --all-features --no-fail-fast -p bitarray + env: + CARGO_INCREMENTAL: 0 + RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + + - name: Gather coverage information (bitarray) + id: coverage-bitarray + uses: actions-rs/grcov@v0.1 + + - name: Upload coverage reports to Codecov (bitarray) + uses: codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ${{ steps.coverage-bitarray.outputs.report }} + flags: bitarray + verbose: true + fail_ci_if_error: true + + + - name: Run cargo test (fa-compression) uses: actions-rs/cargo@v1 with: @@ -42,6 +69,8 @@ jobs: verbose: true fail_ci_if_error: true + + - name: Run cargo test (sa-mappings) uses: actions-rs/cargo@v1 with: diff --git a/bitarray/README.md b/bitarray/README.md new file mode 100644 index 0000000..074f7c5 --- /dev/null +++ b/bitarray/README.md @@ -0,0 +1,21 @@ +# Bitarray + +![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/unipept/unipept-index/test.yml?logo=github) +![Codecov](https://img.shields.io/codecov/c/github/unipept/unipept-index?token=IZ75A2FY98&flag=bitarray&logo=codecov) +![Static Badge](https://img.shields.io/badge/doc-rustdoc-blue) + +The `bitarray` offers a special array where each item is represented by a specified amount of bits (smaller than 64). The bitarray uses a pre-alocated vector and allows you to `set` or `get` a value from the array. 
+ +## Example + +```rust +use bitarray; + +fn main() { + let bitarray = BitArray::<40>::with_capacity(4); + + bitarray.set(0, 0b0001110011111010110001000111111100110010); + + assert_eq!(bitarray.get(0), 0b0001110011111010110001000111111100110010); +} +``` diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index d8a1e9b..5a721dc 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -83,7 +83,6 @@ mod tests { #[test] fn test_bitarray_set() { let mut bitarray = BitArray::<40>::with_capacity(4); - bitarray.data = vec![ 0, 0, 0 ]; bitarray.set(0, 0b0001110011111010110001000111111100110010); bitarray.set(1, 0b1100001001010010011000010100110111001001); diff --git a/codecov.yml b/codecov.yml index 030fcbe..6d769bf 100644 --- a/codecov.yml +++ b/codecov.yml @@ -3,6 +3,10 @@ coverage: project: default: target: 90% + bitarray: + target: 90% + flags: + - bitarray fa-compression: target: 90% flags: @@ -14,6 +18,10 @@ coverage: patch: default: target: 90% + bitarray: + target: 90% + flags: + - bitarray fa-compression: target: 90% flags: @@ -24,6 +32,10 @@ coverage: - sa-mappings flags: + bitarray: + paths: + - bitarray + carryforward: true fa-compression: paths: - fa-compression From a6180192f0391ccb3e4f8cc1578042f3ded3aab4 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Thu, 16 May 2024 15:51:29 +0200 Subject: [PATCH 03/26] document library --- bitarray/src/binary.rs | 69 ++++++++++++++++++++++++++++++++++++++++++ bitarray/src/lib.rs | 60 +++++++++++++++++++++++++++++++++--- 2 files changed, 125 insertions(+), 4 deletions(-) diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs index 546b65c..67a7be4 100644 --- a/bitarray/src/binary.rs +++ b/bitarray/src/binary.rs @@ -1,13 +1,45 @@ +//! This module provides utilities for reading and writing the bitarray as binary. + use std::io::{BufRead, Read, Result, Write}; use crate::BitArray; +/// The `Binary` trait provides methods for reading and writing a struct as binary. pub trait Binary { + /// Writes the struct as binary to the given writer. + /// + /// # Arguments + /// + /// * `writer` - The writer to write the binary data to. + /// + /// # Returns + /// + /// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs. fn write_binary(&self, writer: W) -> Result<()>; + + /// Reads binary data into a struct from the given reader. + /// + /// # Arguments + /// + /// * `reader` - The reader to read the binary data from. + /// + /// # Returns + /// + /// Returns `Ok(())` if the read operation is successful, or an `Err` if an error occurs. fn read_binary(&mut self, reader: R) -> Result<()>; } +/// Implementation of the `Binary` trait for the `BitArray` struct. impl Binary for BitArray { + /// Writes the binary representation of the `BitArray` to the given writer. + /// + /// # Arguments + /// + /// * `writer` - The writer to which the binary data will be written. + /// + /// # Errors + /// + /// Returns an error if there was a problem writing to the writer. fn write_binary(&self, mut writer: W) -> Result<()> { for value in self.data.iter() { writer.write_all(&value.to_le_bytes())?; @@ -16,6 +48,15 @@ impl Binary for BitArray { Ok(()) } + /// Reads the binary representation of the `BitArray` from the given reader. + /// + /// # Arguments + /// + /// * `reader` - The reader from which the binary data will be read. + /// + /// # Errors + /// + /// Returns an error if there was a problem reading from the reader. 
fn read_binary(&mut self, mut reader: R) -> Result<()> { self.data.clear(); @@ -36,6 +77,17 @@ impl Binary for BitArray { } } +/// Fills the buffer with data read from the input. +/// +/// # Arguments +/// +/// * `input` - The input source to read data from. +/// * `buffer` - The buffer to fill with data. +/// +/// # Returns +/// +/// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input is reached, +/// and `bytes_read` is the number of bytes read into the buffer. fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { // Store the buffer size in advance, because rust will complain // about the buffer being borrowed mutably while it's borrowed @@ -71,6 +123,14 @@ fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { mod tests { use super::*; + pub struct ErrorInput; + + impl Read for ErrorInput { + fn read(&mut self, _buf: &mut [u8]) -> std::io::Result { + Err(std::io::Error::new(std::io::ErrorKind::Other, "read error")) + } + } + #[test] fn test_fill_buffer() { let input_str = "a".repeat(8_000); @@ -90,6 +150,15 @@ mod tests { } } + #[test] + #[should_panic] + fn test_fill_buffer_read_error() { + let mut input = ErrorInput; + let mut buffer = vec![0; 800]; + + fill_buffer(&mut input, &mut buffer); + } + #[test] fn test_write_binary() { let mut bitarray = BitArray::<40>::with_capacity(4); diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 5a721dc..393bab3 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -1,12 +1,30 @@ -pub mod binary; +//! This module contains the `BitArray` struct and its associated methods. +mod binary; + +/// Re-export the `Binary` trait. +pub use binary::Binary; + +/// A fixed-size bit array implementation. pub struct BitArray { - pub data: Vec, - pub mask: u64, - pub len: usize, + /// The underlying data storage for the bit array. + data: Vec, + /// The mask used to extract the relevant bits from each element in the data vector. + mask: u64, + /// The length of the bit array. + len: usize, } impl BitArray { + /// Creates a new `BitArray` with the specified capacity. + /// + /// # Arguments + /// + /// * `capacity` - The number of bits the `BitArray` can hold. + /// + /// # Returns + /// + /// A new `BitArray` with the specified capacity. pub fn with_capacity(capacity: usize) -> Self { Self { data: vec![0; capacity * B / 64 + 1], @@ -15,29 +33,54 @@ impl BitArray { } } + /// Retrieves the value at the specified index in the `BitArray`. + /// + /// # Arguments + /// + /// * `index` - The index of the value to retrieve. + /// + /// # Returns + /// + /// The value at the specified index. 
pub fn get(&self, index: usize) -> u64 { let start_block = index * B / 64; let start_block_offset = index * B % 64; + // If the value is contained within a single block if start_block_offset + B <= 64 { + // Shift the value to the right so that the relevant bits are in the least significant position + // Then mask out the irrelevant bits return self.data[start_block] >> (64 - start_block_offset - B) & self.mask; } let end_block = (index + 1) * B / 64; let end_block_offset = (index + 1) * B % 64; + // Extract the relevant bits from the start block and shift them {end_block_offset} bits to the left let a = self.data[start_block] << end_block_offset; + + // Extract the relevant bits from the end block and shift them to the least significant position let b = self.data[end_block] >> (64 - end_block_offset); + // Paste the two values together and mask out the irrelevant bits (a | b) & self.mask } + /// Sets the value at the specified index in the `BitArray`. + /// + /// # Arguments + /// + /// * `index` - The index of the value to set. + /// * `value` - The value to set at the specified index. pub fn set(&mut self, index: usize, value: u64) { let start_block = index * B / 64; let start_block_offset = index * B % 64; + // If the value is contained within a single block if start_block_offset + B <= 64 { + // Clear the relevant bits in the start block self.data[start_block] &= !(self.mask << (64 - start_block_offset - B)); + // Set the relevant bits in the start block self.data[start_block] |= value << (64 - start_block_offset - B); return; } @@ -45,13 +88,22 @@ impl BitArray { let end_block = (index + 1) * B / 64; let end_block_offset = (index + 1) * B % 64; + // Clear the relevant bits in the start block self.data[start_block] &= !(self.mask >> start_block_offset); + // Set the relevant bits in the start block self.data[start_block] |= value >> end_block_offset; + // Clear the relevant bits in the end block self.data[end_block] &= !(self.mask << (64 - end_block_offset)); + // Set the relevant bits in the end block self.data[end_block] |= value << (64 - end_block_offset); } + /// Returns the length of the `BitArray`. + /// + /// # Returns + /// + /// The length of the `BitArray`. pub fn len(&self) -> usize { self.len } From bbe0bc71528a091ba26f9b8d7c6e5219d295a253 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Thu, 16 May 2024 15:55:44 +0200 Subject: [PATCH 04/26] cargo formatting + better panic message in testing --- bitarray/src/binary.rs | 41 ++++++++++++++++++++++++----------------- bitarray/src/lib.rs | 22 ++++++++++++---------- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs index 67a7be4..7a4e272 100644 --- a/bitarray/src/binary.rs +++ b/bitarray/src/binary.rs @@ -1,6 +1,11 @@ //! This module provides utilities for reading and writing the bitarray as binary. -use std::io::{BufRead, Read, Result, Write}; +use std::io::{ + BufRead, + Read, + Result, + Write +}; use crate::BitArray; @@ -61,11 +66,12 @@ impl Binary for BitArray { self.data.clear(); let mut buffer = vec![0; 8 * 1024]; - + loop { let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer); - for buffer_slice in buffer[..bytes_read].chunks_exact(8) { - self.data.push(u64::from_le_bytes(buffer_slice.try_into().unwrap())); + for buffer_slice in buffer[.. 
bytes_read].chunks_exact(8) { + self.data + .push(u64::from_le_bytes(buffer_slice.try_into().unwrap())); } if finished { @@ -86,8 +92,8 @@ impl Binary for BitArray { /// /// # Returns /// -/// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input is reached, -/// and `bytes_read` is the number of bytes read into the buffer. +/// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input +/// is reached, and `bytes_read` is the number of bytes read into the buffer. fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { // Store the buffer size in advance, because rust will complain // about the buffer being borrowed mutably while it's borrowed @@ -109,7 +115,7 @@ fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { // We've read {bytes_read} bytes Ok(bytes_read) => { // Shrink the writable buffer slice - writable_buffer_space = writable_buffer_space[bytes_read..].as_mut(); + writable_buffer_space = writable_buffer_space[bytes_read ..].as_mut(); } Err(err) => { @@ -137,7 +143,7 @@ mod tests { let mut input = input_str.as_bytes(); let mut buffer = vec![0; 800]; - + loop { let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer); @@ -151,7 +157,7 @@ mod tests { } #[test] - #[should_panic] + #[should_panic(expected = "Error while reading input:")] fn test_fill_buffer_read_error() { let mut input = ErrorInput; let mut buffer = vec![0; 800]; @@ -170,19 +176,20 @@ mod tests { let mut buffer = Vec::new(); bitarray.write_binary(&mut buffer).unwrap(); - assert_eq!(buffer, vec![ - 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, - 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 - ]); + assert_eq!( + buffer, + vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, + 0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ] + ); } #[test] fn test_read_binary() { let buffer = vec![ - 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, - 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, + 0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0, ]; let mut bitarray = BitArray::<40>::with_capacity(4); diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 393bab3..6fa4d4d 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -12,7 +12,7 @@ pub struct BitArray { /// The mask used to extract the relevant bits from each element in the data vector. mask: u64, /// The length of the bit array. 
- len: usize, + len: usize } impl BitArray { @@ -29,7 +29,7 @@ impl BitArray { Self { data: vec![0; capacity * B / 64 + 1], mask: (1 << B) - 1, - len: capacity, + len: capacity } } @@ -48,18 +48,20 @@ impl BitArray { // If the value is contained within a single block if start_block_offset + B <= 64 { - // Shift the value to the right so that the relevant bits are in the least significant position - // Then mask out the irrelevant bits + // Shift the value to the right so that the relevant bits are in the least significant + // position Then mask out the irrelevant bits return self.data[start_block] >> (64 - start_block_offset - B) & self.mask; } let end_block = (index + 1) * B / 64; let end_block_offset = (index + 1) * B % 64; - // Extract the relevant bits from the start block and shift them {end_block_offset} bits to the left + // Extract the relevant bits from the start block and shift them {end_block_offset} bits to + // the left let a = self.data[start_block] << end_block_offset; - // Extract the relevant bits from the end block and shift them to the least significant position + // Extract the relevant bits from the end block and shift them to the least significant + // position let b = self.data[end_block] >> (64 - end_block_offset); // Paste the two values together and mask out the irrelevant bits @@ -116,7 +118,7 @@ mod tests { #[test] fn test_bitarray_with_capacity() { let bitarray = BitArray::<40>::with_capacity(4); - assert_eq!(bitarray.data, vec![ 0, 0, 0 ]); + assert_eq!(bitarray.data, vec![0, 0, 0]); assert_eq!(bitarray.mask, 0xff_ffff_ffff); assert_eq!(bitarray.len, 4); } @@ -124,7 +126,7 @@ mod tests { #[test] fn test_bitarray_get() { let mut bitarray = BitArray::<40>::with_capacity(4); - bitarray.data = vec![ 0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144eb9ca32eb4a4 ]; + bitarray.data = vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144eb9ca32eb4a4]; assert_eq!(bitarray.get(0), 0b0001110011111010110001000111111100110010); assert_eq!(bitarray.get(1), 0b1100001001010010011000010100110111001001); @@ -141,7 +143,7 @@ mod tests { bitarray.set(2, 0b1111001101001101101101101011101001010001); bitarray.set(3, 0b0000100010010001010001001110101110011100); - assert_eq!(bitarray.data, vec![ 0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000 ]); + assert_eq!(bitarray.data, vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000]); } #[test] @@ -149,4 +151,4 @@ mod tests { let bitarray = BitArray::<40>::with_capacity(4); assert_eq!(bitarray.len(), 4); } -} \ No newline at end of file +} From 3a426fe8f7d8f96ee20a04411b9f3ef0c320e981 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Thu, 16 May 2024 15:59:49 +0200 Subject: [PATCH 05/26] add is_empty method for clippy --- bitarray/src/lib.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 6fa4d4d..c8804b2 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -109,6 +109,15 @@ impl BitArray { pub fn len(&self) -> usize { self.len } + + /// Checks if the `BitArray` is empty. + /// + /// # Returns + /// + /// `true` if the `BitArray` is empty, `false` otherwise. 
+ pub fn is_empty(&self) -> bool { + self.len == 0 + } } #[cfg(test)] @@ -151,4 +160,16 @@ mod tests { let bitarray = BitArray::<40>::with_capacity(4); assert_eq!(bitarray.len(), 4); } + + #[test] + fn test_bitarray_is_empty() { + let bitarray = BitArray::<40>::with_capacity(0); + assert!(bitarray.is_empty()); + } + + #[test] + fn test_bitarray_is_not_empty() { + let bitarray = BitArray::<40>::with_capacity(4); + assert!(!bitarray.is_empty()); + } } From 768d128f419cc24ab5a7e8b2e2cfdd07150f1010 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Thu, 16 May 2024 16:01:12 +0200 Subject: [PATCH 06/26] cargo formatting --- bitarray/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index c8804b2..48dd0a9 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -111,9 +111,9 @@ impl BitArray { } /// Checks if the `BitArray` is empty. - /// + /// /// # Returns - /// + /// /// `true` if the `BitArray` is empty, `false` otherwise. pub fn is_empty(&self) -> bool { self.len == 0 From 7dd5c4b4d616eecea69acad1b36cbb1fa61ec06a Mon Sep 17 00:00:00 2001 From: tibvdm Date: Fri, 17 May 2024 11:13:23 +0200 Subject: [PATCH 07/26] function to compress data without having to store everything in memory --- bitarray/src/binary.rs | 43 +++++++++++-------------- bitarray/src/lib.rs | 72 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 25 deletions(-) diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs index 7a4e272..36c8005 100644 --- a/bitarray/src/binary.rs +++ b/bitarray/src/binary.rs @@ -1,11 +1,6 @@ //! This module provides utilities for reading and writing the bitarray as binary. -use std::io::{ - BufRead, - Read, - Result, - Write -}; +use std::io::{BufRead, Read, Result, Write}; use crate::BitArray; @@ -20,7 +15,7 @@ pub trait Binary { /// # Returns /// /// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs. - fn write_binary(&self, writer: W) -> Result<()>; + fn write_binary(&self, writer: &mut W) -> Result<()>; /// Reads binary data into a struct from the given reader. /// @@ -45,7 +40,7 @@ impl Binary for BitArray { /// # Errors /// /// Returns an error if there was a problem writing to the writer. - fn write_binary(&self, mut writer: W) -> Result<()> { + fn write_binary(&self, writer: &mut W) -> Result<()> { for value in self.data.iter() { writer.write_all(&value.to_le_bytes())?; } @@ -66,12 +61,11 @@ impl Binary for BitArray { self.data.clear(); let mut buffer = vec![0; 8 * 1024]; - + loop { let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer); - for buffer_slice in buffer[.. bytes_read].chunks_exact(8) { - self.data - .push(u64::from_le_bytes(buffer_slice.try_into().unwrap())); + for buffer_slice in buffer[..bytes_read].chunks_exact(8) { + self.data.push(u64::from_le_bytes(buffer_slice.try_into().unwrap())); } if finished { @@ -92,8 +86,8 @@ impl Binary for BitArray { /// /// # Returns /// -/// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input -/// is reached, and `bytes_read` is the number of bytes read into the buffer. +/// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input is reached, +/// and `bytes_read` is the number of bytes read into the buffer. 
fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { // Store the buffer size in advance, because rust will complain // about the buffer being borrowed mutably while it's borrowed @@ -115,7 +109,7 @@ fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { // We've read {bytes_read} bytes Ok(bytes_read) => { // Shrink the writable buffer slice - writable_buffer_space = writable_buffer_space[bytes_read ..].as_mut(); + writable_buffer_space = writable_buffer_space[bytes_read..].as_mut(); } Err(err) => { @@ -143,7 +137,7 @@ mod tests { let mut input = input_str.as_bytes(); let mut buffer = vec![0; 800]; - + loop { let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer); @@ -176,20 +170,19 @@ mod tests { let mut buffer = Vec::new(); bitarray.write_binary(&mut buffer).unwrap(); - assert_eq!( - buffer, - vec![ - 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, - 0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 - ] - ); + assert_eq!(buffer, vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, + 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ]); } #[test] fn test_read_binary() { let buffer = vec![ - 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, - 0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0, + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, + 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 ]; let mut bitarray = BitArray::<40>::with_capacity(4); diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 48dd0a9..740e8b1 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -2,6 +2,8 @@ mod binary; +use std::io::{Write, Result}; + /// Re-export the `Binary` trait. pub use binary::Binary; @@ -118,6 +120,62 @@ impl BitArray { pub fn is_empty(&self) -> bool { self.len == 0 } + + /// Clears the `BitArray`, setting all bits to 0. + pub fn clear(&mut self) { + self.data.iter_mut().for_each(|x| *x = 0); + } +} + + +/// Writes the data to a writer in a binary format using a bit array. This function is helpfull +/// when writing large amounts of data to a writer in chunks. The data is written in chunks of the +/// specified capacity, so memory usage is minimized. +/// +/// # Arguments +/// +/// * `data` - The data to write. +/// * `writer` - The writer to write the data to. +/// * `max_capacity` - The maximum amount of elements that may be stored in the bit array. +/// +/// # Returns +/// +/// A `Result` indicating whether the write operation was successful or not. 
+pub fn data_to_writer( + data: Vec, + writer: &mut impl Write, + max_capacity: usize +) -> Result<()> { + // Calculate the capacity of the bit array so the data buffer can be stored entirely + // This makes the process of writing partial data to the writer easier as bounds checking is not needed + let capacity = max_capacity % (B * 64) * B * 64; + + // Create a bit array that can store a single chunk of data + let mut bitarray = BitArray::::with_capacity(capacity); + + // Write the data to the writer in chunks of the specified capacity + let chunks = data.chunks_exact(capacity); + + // Store the remainder before looping over the chunks + let remainder = chunks.remainder(); + + for chunk in chunks { + for (i, &value) in chunk.iter().enumerate() { + bitarray.set(i, value as u64); + } + bitarray.write_binary(writer)?; + bitarray.clear(); + } + + // Create a new bit array with the remainder capacity + bitarray = BitArray::::with_capacity(remainder.len()); + + for (i, &value) in remainder.iter().enumerate() { + bitarray.set(i, value as u64); + } + bitarray.write_binary(writer)?; + + Ok(()) } #[cfg(test)] @@ -172,4 +230,18 @@ mod tests { let bitarray = BitArray::<40>::with_capacity(4); assert!(!bitarray.is_empty()); } + + #[test] + fn test_data_to_writer() { + let data = vec![0x1234567890, 0xabcdef0123, 0x4567890abc, 0xdef0123456]; + let mut writer = Vec::new(); + + data_to_writer::<40>(data, &mut writer, 2).unwrap(); + + assert_eq!(writer, vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, + 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ]); + } } From df85796f4b0f4ec0762b0ad50e26ffe22362d964 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Fri, 17 May 2024 17:45:34 +0200 Subject: [PATCH 08/26] compressed integration + some major refactoring --- Cargo.lock | 12 +- Cargo.toml | 2 +- bitarray/src/lib.rs | 5 +- sa-builder/Cargo.toml | 2 + sa-builder/src/binary.rs | 149 ---------------------- sa-builder/src/lib.rs | 49 +++++--- sa-builder/src/main.rs | 75 ++++++----- sa-compression/Cargo.toml | 9 ++ sa-compression/src/lib.rs | 92 ++++++++++++++ sa-index/Cargo.toml | 2 +- sa-index/src/binary.rs | 149 ++++++++++++++++++++++ sa-index/src/lib.rs | 240 +++--------------------------------- sa-index/src/main.rs | 13 -- sa-index/src/sa_searcher.rs | 12 +- sa-index/src/util.rs | 63 ---------- sa-server/Cargo.toml | 3 +- sa-server/src/main.rs | 34 ++++- 17 files changed, 395 insertions(+), 516 deletions(-) delete mode 100644 sa-builder/src/binary.rs create mode 100644 sa-compression/Cargo.toml create mode 100644 sa-compression/src/lib.rs create mode 100644 sa-index/src/binary.rs delete mode 100644 sa-index/src/main.rs delete mode 100644 sa-index/src/util.rs diff --git a/Cargo.lock b/Cargo.lock index 56db76a..3f82911 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1412,16 +1412,25 @@ dependencies = [ "clap 4.5.4", "libdivsufsort-rs", "libsais64-rs", + "sa-compression", + "sa-index", "sa-mappings", ] +[[package]] +name = "sa-compression" +version = "0.1.0" +dependencies = [ + "bitarray", +] + [[package]] name = "sa-index" version = "0.1.0" dependencies = [ + "bitarray", "clap 4.5.4", "rayon", - "sa-builder", "sa-mappings", "serde", "serde_json", @@ -1447,6 +1456,7 @@ dependencies = [ "axum", "clap 4.5.4", "sa-builder", + "sa-compression", "sa-index", "sa-mappings", "serde", diff --git a/Cargo.toml b/Cargo.toml index 617ad70..fec447c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,7 @@ resolver = "2" members = [ "bitarray", "fa-compression", "libsais64-rs", 
- "sa-builder", + "sa-builder", "sa-compression", "sa-index", "sa-mappings", "sa-server" diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 740e8b1..e0014d3 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -2,7 +2,7 @@ mod binary; -use std::io::{Write, Result}; +use std::{io::{Result, Write}, ops::Index}; /// Re-export the `Binary` trait. pub use binary::Binary; @@ -127,7 +127,6 @@ impl BitArray { } } - /// Writes the data to a writer in a binary format using a bit array. This function is helpfull /// when writing large amounts of data to a writer in chunks. The data is written in chunks of the /// specified capacity, so memory usage is minimized. @@ -148,7 +147,7 @@ pub fn data_to_writer( ) -> Result<()> { // Calculate the capacity of the bit array so the data buffer can be stored entirely // This makes the process of writing partial data to the writer easier as bounds checking is not needed - let capacity = max_capacity % (B * 64) * B * 64; + let capacity = max_capacity / (B * 64) * B * 64; // Create a bit array that can store a single chunk of data let mut bitarray = BitArray::::with_capacity(capacity); diff --git a/sa-builder/Cargo.toml b/sa-builder/Cargo.toml index 9937384..cb7d553 100644 --- a/sa-builder/Cargo.toml +++ b/sa-builder/Cargo.toml @@ -10,3 +10,5 @@ clap = { version = "4.4.8", features = ["derive"] } libsais64-rs = { path = "../libsais64-rs" } libdivsufsort-rs = "0.1.0" sa-mappings = { path = "../sa-mappings" } +sa-compression = { path = "../sa-compression" } +sa-index = { path = "../sa-index" } diff --git a/sa-builder/src/binary.rs b/sa-builder/src/binary.rs deleted file mode 100644 index ad417d3..0000000 --- a/sa-builder/src/binary.rs +++ /dev/null @@ -1,149 +0,0 @@ -use std::{ - cmp::min, - error::Error, - fs::{ - File, - OpenOptions - }, - io::{ - Read, - Write - } -}; - -const ONE_GIB: usize = 2usize.pow(30); - -/// Trait implemented by structs that are binary serializable -/// In our case this is will be a [i64] since the suffix array is a Vec -pub trait Serializable { - /// Serializes self into a vector of bytes - /// - /// # Returns - /// - /// Returns a vector of bytes - fn serialize(&self) -> Vec; -} - -impl Serializable for [i64] { - fn serialize(&self) -> Vec { - let mut res = vec![]; - self.iter() - .for_each(|entry| res.extend_from_slice(&entry.to_le_bytes())); - res - } -} - -/// Deserializes a vector of bytes into the suffix array -/// -/// # Arguments -/// * `data` - The raw bytes needed to be serialized into a suffix array -/// -/// # Returns -/// -/// Returns the suffix array, a Vec -fn deserialize_sa(data: &[u8]) -> Vec { - let mut res = vec![]; - if data.len() % 8 != 0 { - panic!("Serialized data is not a multiple of 8 bytes long!") - } - for start in (0 .. data.len()).step_by(8) { - res.push(i64::from_le_bytes(data[start .. 
start + 8].try_into().unwrap())); - } - res -} - -/// Writes the given suffix array with the `sparseness_factor` factor to the given file -/// -/// # Arguments -/// * `sparseness_factor` - The sparseness factor of the suffix array -/// * `suffix_array` - The suffix array -/// * `filename` - The name of the file we want to write the suffix array to -/// -/// # Returns -/// -/// Returns () if writing away the suffix array succeeded -/// -/// # Errors -/// -/// Returns an io::Error if writing away the suffix array failed -pub fn write_suffix_array( - sparseness_factor: u8, - suffix_array: &[i64], - filename: &str -) -> Result<(), std::io::Error> { - // create the file - let mut f = OpenOptions::new() - .create(true) - .write(true) - .truncate(true) // if the file already exists, empty the file - .open(filename)?; - f.write_all(&[sparseness_factor])?; // write the sample rate as the first byte - - // write 1 GiB at a time, to minimize extra used memory since we need to translate i64 to [u8; - // 8] - let sa_len = suffix_array.len(); - for start_index in (0 .. sa_len).step_by(ONE_GIB / 8) { - let end_index = min(start_index + ONE_GIB / 8, sa_len); - f.write_all(&suffix_array[start_index .. end_index].serialize())?; - } - - Ok(()) -} - -/// Loads the suffix array from the file with the given `filename` -/// -/// # Arguments -/// * `filename` - The filename of the file where the suffix array is stored -/// -/// # Returns -/// -/// Returns the sample rate of the suffix array, together with the suffix array -/// -/// # Errors -/// -/// Returns any error from opening the file or reading the file -pub fn load_suffix_array(filename: &str) -> Result<(u8, Vec), Box> { - let mut file = &File::open(filename)?; - let mut sparseness_factor_buffer = [0_u8; 1]; - file.read_exact(&mut sparseness_factor_buffer) - .map_err(|_| "Could not read the sample rate from the binary file")?; - let sparseness_factor = sparseness_factor_buffer[0]; - - let mut sa = vec![]; - loop { - let mut buffer = vec![]; - // use take in combination with read_to_end to ensure that the buffer will be completely - // filled (except when the file is smaller than the buffer) - let count = file.take(ONE_GIB as u64).read_to_end(&mut buffer)?; - if count == 0 { - break; - } - sa.extend_from_slice(&deserialize_sa(&buffer[.. 
count])); - } - - Ok((sparseness_factor, sa)) -} - -#[cfg(test)] -mod tests { - use crate::binary::{ - deserialize_sa, - Serializable - }; - - #[test] - fn test_serialize_deserialize() { - let data: Vec = vec![5, 2165487362, -12315135]; - let serialized = data.serialize(); - let deserialized = deserialize_sa(serialized.as_ref()); - assert_eq!(data, deserialized); - } - - #[test] - fn test_serialize_deserialize_empty() { - let data: Vec = vec![]; - let serialized = data.serialize(); - let deserialized = deserialize_sa(serialized.as_ref()); - assert_eq!(data, deserialized); - } -} diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 65092e5..29097c9 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -1,5 +1,3 @@ -pub mod binary; - use std::error::Error; use clap::{ @@ -25,7 +23,10 @@ pub struct Arguments { #[arg(long, default_value_t = 1)] pub sparseness_factor: u8, #[arg(short, long, value_enum, default_value_t = SAConstructionAlgorithm::LibSais)] - pub construction_algorithm: SAConstructionAlgorithm + pub construction_algorithm: SAConstructionAlgorithm, + /// If the suffix array should be compressed (default value true) + #[arg(long, default_value_t = true)] + pub compress_sa: bool } /// Enum representing the two possible algorithms to construct the suffix array @@ -49,18 +50,15 @@ pub enum SAConstructionAlgorithm { /// # Errors /// /// The errors that occurred during the building of the suffix array itself -pub fn build_sa( +pub fn build_ssa( data: &mut Vec, construction_algorithm: &SAConstructionAlgorithm, sparseness_factor: u8 ) -> Result, Box> { // translate all L's to a I - for character in data.iter_mut() { - if *character == b'L' { - *character = b'I' - } - } + translate_l_to_i(data); + // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(data), SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(data) @@ -69,17 +67,30 @@ pub fn build_sa( // make the SA sparse and decrease the vector size if we have sampling (== sampling_rate > 1) if sparseness_factor > 1 { - let mut current_sampled_index = 0; - for i in 0 .. sa.len() { - let current_sa_val = sa[i]; - if current_sa_val % sparseness_factor as i64 == 0 { - sa[current_sampled_index] = current_sa_val; - current_sampled_index += 1; - } - } - // make shorter - sa.resize(current_sampled_index, 0); + sample_sa(&mut sa, sparseness_factor) } Ok(sa) } + +fn translate_l_to_i(data: &mut Vec) { + for character in data.iter_mut() { + if *character == b'L' { + *character = b'I' + } + } +} + +fn sample_sa(sa: &mut Vec, sparseness_factor: u8) { + let mut current_sampled_index = 0; + for i in 0 .. 
sa.len() { + let current_sa_val = sa[i]; + if current_sa_val % sparseness_factor as i64 == 0 { + sa[current_sampled_index] = current_sa_val; + current_sampled_index += 1; + } + } + + // make shorter + sa.resize(current_sampled_index, 0); +} diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index e651673..9da2b99 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -1,9 +1,12 @@ +use std::{fs::{File, OpenOptions}, io::Result}; + use clap::Parser; use sa_builder::{ - binary::write_suffix_array, - build_sa, + build_ssa, Arguments }; +use sa_index::binary::dump_suffix_array; +use sa_compression::dump_compressed_suffix_array; use sa_mappings::{ proteins::Proteins, taxonomy::{ @@ -13,42 +16,54 @@ use sa_mappings::{ }; fn main() { - let args = Arguments::parse(); let Arguments { database_file, taxonomy, output, sparseness_factor, - construction_algorithm - } = args; - let taxon_id_calculator = - TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar); - if let Err(err) = taxon_id_calculator { - eprintln!("{}", err); - std::process::exit(1); - } + construction_algorithm, + compress_sa + } = Arguments::parse(); - let taxon_id_calculator = taxon_id_calculator.unwrap(); + let taxon_id_calculator = TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar).unwrap_or_else( + |err| eprint_and_exit(err.to_string().as_str()) + ); // read input - let data = - Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator); - if let Err(err) = data { - eprintln!("{}", err); - std::process::exit(1); - } - let mut data = data.unwrap(); - // calculate sa - let sa = build_sa(&mut data, &construction_algorithm, sparseness_factor); - if let Err(err) = sa { - eprintln!("{}", err); - std::process::exit(1); + let mut data = Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator).unwrap_or_else( + |err| eprint_and_exit(err.to_string().as_str()) + ); + + // calculate sparse suffix array + let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor).unwrap_or_else( + |err| eprint_and_exit(err.to_string().as_str()) + ); + + // open the output file + let mut file = open_file(&output).unwrap_or_else( + |err| eprint_and_exit(err.to_string().as_str()) + ); + + if compress_sa { + if let Err(err) = dump_compressed_suffix_array::<37>(sa, sparseness_factor, &mut file) { + eprint_and_exit(err.to_string().as_str()); + }; + } else { + if let Err(err) = dump_suffix_array(&sa, sparseness_factor, &mut file) { + eprint_and_exit(err.to_string().as_str()); + }; } - let sa = sa.unwrap(); +} + +fn open_file(file: &str) -> Result { + OpenOptions::new() + .create(true) + .write(true) + .truncate(true) // if the file already exists, empty the file + .open(file) +} - // output the build SA - if let Err(err) = write_suffix_array(sparseness_factor, &sa, &output) { - eprintln!("{}", err); - std::process::exit(1); - }; +fn eprint_and_exit(err: &str) -> ! 
{ + eprintln!("{}", err); + std::process::exit(1); } diff --git a/sa-compression/Cargo.toml b/sa-compression/Cargo.toml new file mode 100644 index 0000000..a53939b --- /dev/null +++ b/sa-compression/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "sa-compression" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +bitarray = { path = "../bitarray" } diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs new file mode 100644 index 0000000..4ee9abf --- /dev/null +++ b/sa-compression/src/lib.rs @@ -0,0 +1,92 @@ +use std::{error::Error, io::{BufRead, Write}}; + +use bitarray::{data_to_writer, Binary, BitArray}; + +pub fn dump_compressed_suffix_array( + sa: Vec, + sparseness_factor: u8, + writer: &mut impl Write, +) -> Result<(), Box> { + // Write the flags to the writer + // 00000001 indicates that the suffix array is compressed + writer.write(&[1_u8]).map_err(|_| "Could not write the flags to the writer")?; + + // Write the sparseness factor to the writer + writer.write(&[sparseness_factor]).map_err(|_| "Could not write the sparseness factor to the writer")?; + + // Write the size of the suffix array to the writer + writer.write(&(sa.len() as u64).to_le_bytes()).map_err(|_| "Could not write the size of the suffix array to the writer")?; + + // Compress the suffix array and write it to the writer + data_to_writer::(sa, writer, 8 * 1024).map_err(|_| "Could not write the compressed suffix array to the writer")?; + + Ok(()) +} + +pub fn load_compressed_suffix_array( + reader: &mut impl BufRead, +) -> Result<(u8, BitArray), Box> { + // Read the sample rate from the binary file (1 byte) + let mut sample_rate_buffer = [0_u8; 1]; + reader.read_exact(&mut sample_rate_buffer).map_err(|_| "Could not read the sample rate from the binary file")?; + let sample_rate = sample_rate_buffer[0]; + + // Read the size of the suffix array from the binary file (8 bytes) + let mut size_buffer = [0_u8; 8]; + reader.read_exact(&mut size_buffer).map_err(|_| "Could not read the size of the suffix array from the binary file")?; + let size = u64::from_le_bytes(size_buffer) as usize; + + // Read the compressed suffix array from the binary file + let mut compressed_suffix_array = BitArray::::with_capacity(size); + compressed_suffix_array.read_binary(reader).map_err(|_| "Could not read the compressed suffix array from the binary file")?; + + Ok((sample_rate, compressed_suffix_array)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_dump_compressed_suffix_array() { + let sa = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; + + let mut writer = vec![]; + dump_compressed_suffix_array::<8>(sa, 1, &mut writer).unwrap(); + + assert_eq!(writer, vec![ + // flags + 1, + // sparseness factor + 1, + // size of the suffix array + 10, 0, 0, 0, 0, 0, 0, 0, + // compressed suffix array + 8, 7, 6, 5, 4, 3, 2, 1, + 0, 0, 0, 0, 0, 0, 10, 9 + ]); + } + + #[test] + fn test_load_compressed_suffix_array() { + let data = vec![ + // flags + 1, + // sparseness factor + 1, + // size of the suffix array + 10, 0, 0, 0, 0, 0, 0, 0, + // compressed suffix array + 8, 7, 6, 5, 4, 3, 2, 1, + 0, 0, 0, 0, 0, 0, 10, 9 + ]; + + let mut reader = std::io::BufReader::new(&data[..]); + let (sample_rate, compressed_suffix_array) = load_compressed_suffix_array::<8>(&mut reader).unwrap(); + + assert_eq!(sample_rate, 1); + for i in 0..10 { + assert_eq!(compressed_suffix_array.get(i), i as u64 + 1); + } + } +} diff --git a/sa-index/Cargo.toml 
b/sa-index/Cargo.toml index c355bef..986dc3c 100644 --- a/sa-index/Cargo.toml +++ b/sa-index/Cargo.toml @@ -10,6 +10,6 @@ clap = { version = "4.4.8", features = ["derive"] } umgap = "1.1.0" rayon = "1.8.1" serde = { version = "1.0.197", features = ["derive"] } -sa-builder = { path = "../sa-builder" } sa-mappings = { path = "../sa-mappings" } +bitarray = { path = "../bitarray" } serde_json = "1.0.116" diff --git a/sa-index/src/binary.rs b/sa-index/src/binary.rs new file mode 100644 index 0000000..a86c382 --- /dev/null +++ b/sa-index/src/binary.rs @@ -0,0 +1,149 @@ +use std::{error::Error, io::{BufRead, Read, Write}}; + +/// The `Binary` trait provides methods for reading and writing a struct as binary. +pub trait Binary { + /// Writes the struct as binary to the given writer. + /// + /// # Arguments + /// + /// * `writer` - The writer to write the binary data to. + /// + /// # Returns + /// + /// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs. + fn write_binary(&self, writer: &mut W) -> std::io::Result<()>; + + /// Reads binary data into a struct from the given reader. + /// + /// # Arguments + /// + /// * `reader` - The reader to read the binary data from. + /// + /// # Returns + /// + /// Returns `Ok(())` if the read operation is successful, or an `Err` if an error occurs. + fn read_binary(&mut self, reader: R) -> std::io::Result<()>; +} + +impl Binary for Vec { + fn write_binary(&self, writer: &mut W) -> std::io::Result<()> { + for value in self { + writer.write_all(&value.to_le_bytes())?; + } + + Ok(()) + } + + fn read_binary(&mut self, mut reader: R) -> std::io::Result<()> { + self.clear(); + + let mut buffer = vec![0; 8 * 1024]; + + loop { + let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer); + for buffer_slice in buffer[..bytes_read].chunks_exact(8) { + self.push(i64::from_le_bytes(buffer_slice.try_into().unwrap())); + } + + if finished { + break; + } + } + + Ok(()) + } +} + +/// Loads the suffix array from the file with the given `filename` +/// +/// # Arguments +/// * `filename` - The filename of the file where the suffix array is stored +/// +/// # Returns +/// +/// Returns the sample rate of the suffix array, together with the suffix array +/// +/// # Errors +/// +/// Returns any error from opening the file or reading the file +pub fn load_suffix_array(reader: &mut impl BufRead) -> Result<(u8, Vec), Box> { + // Read the sample rate from the binary file (1 byte) + let mut sample_rate_buffer = [0_u8; 1]; + reader.read_exact(&mut sample_rate_buffer).map_err(|_| "Could not read the sample rate from the binary file")?; + let sample_rate = sample_rate_buffer[0]; + + // Read the size of the suffix array from the binary file (8 bytes) + let mut size_buffer = [0_u8; 8]; + reader.read_exact(&mut size_buffer).map_err(|_| "Could not read the size of the suffix array from the binary file")?; + let size = u64::from_le_bytes(size_buffer) as usize; + + let mut sa = Vec::with_capacity(size); + sa.read_binary(reader).map_err(|_| "Could not read the suffix array from the binary file")?; + + Ok((sample_rate, sa)) +} + +pub fn dump_suffix_array( + sa: &Vec, + sparseness_factor: u8, + writer: &mut impl Write, +) -> Result<(), Box> { + // Write the flags to the writer + // 00000000 indicates that the suffix array is not compressed + writer.write(&[0_u8]).map_err(|_| "Could not write the flags to the writer")?; + + // Write the sparseness factor to the writer + writer.write(&[sparseness_factor]).map_err(|_| "Could not write the sparseness factor to 
the writer")?; + + // Write the size of the suffix array to the writer + let sa_len = sa.len(); + writer.write(&(sa_len).to_le_bytes()).map_err(|_| "Could not write the size of the suffix array to the writer")?; + + // Write the suffix array to the writer + let sa = Vec::with_capacity(sa_len); + sa.write_binary(writer).map_err(|_| "Could not write the suffix array to the writer")?; + + Ok(()) +} + +/// Fills the buffer with data read from the input. +/// +/// # Arguments +/// +/// * `input` - The input source to read data from. +/// * `buffer` - The buffer to fill with data. +/// +/// # Returns +/// +/// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input is reached, +/// and `bytes_read` is the number of bytes read into the buffer. +fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { + // Store the buffer size in advance, because rust will complain + // about the buffer being borrowed mutably while it's borrowed + let buffer_size = buffer.len(); + + let mut writable_buffer_space = buffer.as_mut(); + + loop { + match input.read(writable_buffer_space) { + // No bytes written, which means we've completely filled the buffer + // or we've reached the end of the file + Ok(0) => { + return ( + !writable_buffer_space.is_empty(), + buffer_size - writable_buffer_space.len() + ); + } + + // We've read {bytes_read} bytes + Ok(bytes_read) => { + // Shrink the writable buffer slice + writable_buffer_space = writable_buffer_space[bytes_read..].as_mut(); + } + + Err(err) => { + panic!("Error while reading input: {}", err); + } + } + } +} diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs index b017880..2958c16 100644 --- a/sa-index/src/lib.rs +++ b/sa-index/src/lib.rs @@ -1,239 +1,33 @@ -use std::{ - error::Error, - num::NonZeroUsize -}; - -use clap::{ - arg, - Parser, - ValueEnum -}; -use sa_builder::{ - binary::{ - load_suffix_array, - write_suffix_array - }, - build_sa, - SAConstructionAlgorithm -}; -use sa_mappings::{ - functionality::FunctionAggregator, - proteins::Proteins, - taxonomy::{ - AggregationMethod, - TaxonAggregator - } -}; - -use crate::{ - peptide_search::{ - analyse_all_peptides, - search_all_peptides - }, - sa_searcher::Searcher, - suffix_to_protein_index::{ - DenseSuffixToProtein, - SparseSuffixToProtein, - SuffixToProteinIndex, - SuffixToProteinMappingStyle - }, - util::{ - get_time_ms, - read_lines - } -}; +use bitarray::BitArray; +pub mod binary; pub mod peptide_search; pub mod sa_searcher; pub mod suffix_to_protein_index; -pub mod util; -/// Enum that represents the 2 kinds of search that are supported -#[derive(ValueEnum, Clone, Debug, PartialEq)] -pub enum SearchMode { - Search, - Analysis +pub enum SuffixArray { + Original(Vec), + Compressed(BitArray<37>) } -/// Enum that represents all possible commandline arguments -#[derive(Parser, Debug)] -pub struct Arguments { - /// File with the proteins used to build the suffix tree. All the proteins are expected to be - /// concatenated using a `#`. - #[arg(short, long)] - database_file: String, - #[arg(short, long)] - search_file: Option, - #[arg(short, long)] - /// The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy. - taxonomy: String, - /// This will only build the tree and stop after that is completed. Used during benchmarking. - #[arg(long)] - build_only: bool, - /// Output file to store the built index. 
- #[arg(short, long)] - output: Option, - /// The sparseness factor used on the suffix array (default value 1, which means every value in - /// the SA is used) - #[arg(long, default_value_t = 1)] - sparseness_factor: u8, - /// Set the style used to map back from the suffix to the protein. 2 options or - /// . Dense is default Dense uses O(n) memory with n the size of the input text, and - /// takes O(1) time to find the mapping Sparse uses O(m) memory with m the number of - /// proteins, and takes O(log m) to find the mapping - #[arg(long, value_enum, default_value_t = SuffixToProteinMappingStyle::Sparse)] - suffix_to_protein_mapping: SuffixToProteinMappingStyle, - #[arg(long)] - load_index: Option, - #[arg(short, long, value_enum, default_value_t = SAConstructionAlgorithm::LibSais)] - construction_algorithm: SAConstructionAlgorithm, - /// Assume the resulting taxon ID is root (1) whenever a peptide matches >= cutoff proteins - #[arg(long, default_value_t = 10000)] - cutoff: usize, - #[arg(long)] - threads: Option, - #[arg(long)] - equalize_i_and_l: bool, - #[arg(long)] - clean_taxa: bool, - #[arg(long, value_enum, default_value_t = SearchMode::Analysis)] - search_mode: SearchMode -} - -/// Run the suffix array program -/// -/// # Arguments -/// * `args` - The commandline arguments provided to the program -/// -/// # Returns -/// -/// Returns Unit -/// -/// # Errors -/// -/// Returns all possible errors that occurred during the program -pub fn run(mut args: Arguments) -> Result<(), Box> { - let taxon_id_calculator = - TaxonAggregator::try_from_taxonomy_file(&args.taxonomy, AggregationMethod::LcaStar)?; - - let sa = match &args.load_index { - // load SA from file - Some(index_file_name) => { - let (sparseness_factor, sa) = load_suffix_array(index_file_name)?; - args.sparseness_factor = sparseness_factor; - // TODO: some kind of security check that the loaded database file and SA match - sa +impl SuffixArray { + pub fn len(&self) -> usize { + match self { + SuffixArray::Original(sa) => sa.len(), + SuffixArray::Compressed(sa) => sa.len() } - // build the SA - None => { - let protein_sequences = - Proteins::try_from_database_file(&args.database_file, &taxon_id_calculator)?; - build_sa( - &mut protein_sequences.input_string.clone(), - &args.construction_algorithm, - args.sparseness_factor - )? 
- } - }; - - let proteins = Proteins::try_from_database_file(&args.database_file, &taxon_id_calculator)?; - - if let Some(output) = &args.output { - write_suffix_array(args.sparseness_factor, &sa, output)?; } - // option that only builds the tree, but does not allow for querying (easy for benchmark - // purposes) - if args.build_only { - return Ok(()); - } - - // build the right mapping index, use box to be able to store both types in this variable - let suffix_index_to_protein: Box = - match args.suffix_to_protein_mapping { - SuffixToProteinMappingStyle::Dense => { - Box::new(DenseSuffixToProtein::new(&proteins.input_string)) - } - SuffixToProteinMappingStyle::Sparse => { - Box::new(SparseSuffixToProtein::new(&proteins.input_string)) - } - }; - - let functional_aggregator = FunctionAggregator {}; - - let searcher = Searcher::new( - sa, - args.sparseness_factor, - suffix_index_to_protein, - proteins, - taxon_id_calculator, - functional_aggregator - ); - - execute_search(&searcher, &args)?; - Ok(()) -} - -/// Execute the search using the provided programs -/// -/// # Arguments -/// * `searcher` - The Searcher which contains the protein database -/// * `args` - The arguments used to start the program -/// -/// # Returns -/// -/// Returns Unit -/// -/// # Errors -/// -/// Returns possible errors that occurred during search -fn execute_search(searcher: &Searcher, args: &Arguments) -> Result<(), Box> { - let cutoff = args.cutoff; - let search_file = args - .search_file - .as_ref() - .ok_or("No peptide file provided to search in the database")?; - - let start_time = get_time_ms()?; - let lines = read_lines(search_file)?; - let all_peptides: Vec = lines.map_while(Result::ok).collect(); - - // Explicitly set the number of threads to use if the commandline argument was set - if let Some(threads) = args.threads { - rayon::ThreadPoolBuilder::new() - .num_threads(threads.get()) - .build_global()?; - } - - match args.search_mode { - SearchMode::Search => { - let search_result = search_all_peptides( - searcher, - &all_peptides, - cutoff, - args.equalize_i_and_l, - args.clean_taxa - ); - println!("{}", serde_json::to_string(&search_result)?); - } - SearchMode::Analysis => { - let search_result = analyse_all_peptides( - searcher, - &all_peptides, - cutoff, - args.equalize_i_and_l, - args.clean_taxa - ); - println!("{}", serde_json::to_string(&search_result)?); + pub fn get(&self, index: usize) -> i64 { + match self { + SuffixArray::Original(sa) => sa[index], + SuffixArray::Compressed(sa) => sa.get(index) as i64 } } - let end_time = get_time_ms()?; - - // output to other channel to prevent integrating it into the actual output - eprintln!("Spend {} ms to search the whole file", end_time - start_time); - - Ok(()) + pub fn is_empty(&self) -> bool { + self.len() == 0 + } } /// Custom trait implemented by types that have a value that represents NULL diff --git a/sa-index/src/main.rs b/sa-index/src/main.rs deleted file mode 100644 index 8fe7fad..0000000 --- a/sa-index/src/main.rs +++ /dev/null @@ -1,13 +0,0 @@ -use clap::Parser; -use sa_index::{ - run, - Arguments -}; - -fn main() { - let args = Arguments::parse(); - if let Err(error) = run(args) { - eprintln!("{}", error); - std::process::exit(1); - }; -} diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index 351e845..b4c00a5 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -19,7 +19,7 @@ use crate::{ Minimum }, suffix_to_protein_index::SuffixToProteinIndex, - Nullable + Nullable, SuffixArray }; /// Enum 
indicating if we are searching for the minimum, or maximum bound in the suffix array @@ -100,7 +100,7 @@ impl PartialEq for SearchAllSuffixesResult { /// * `function_aggregator` - Object used to retrieve the functional annotations and to calculate /// the functional analysis provided by Unipept pub struct Searcher { - sa: Vec, + sa: SuffixArray, pub sparseness_factor: u8, suffix_index_to_protein: Box, proteins: Proteins, @@ -126,7 +126,7 @@ impl Searcher { /// /// Returns a new Searcher object pub fn new( - sa: Vec, + sa: SuffixArray, sparseness_factor: u8, suffix_index_to_protein: Box, proteins: Proteins, @@ -240,7 +240,7 @@ impl Searcher { while right - left > 1 { let center = (left + right) / 2; let skip = min(lcp_left, lcp_right); - let (retval, lcp_center) = self.compare(search_string, self.sa[center], skip, bound); + let (retval, lcp_center) = self.compare(search_string, self.sa.get(center), skip, bound); found |= lcp_center == search_string.len(); @@ -258,7 +258,7 @@ impl Searcher { // handle edge case to search at index 0 if right == 1 && left == 0 { let (retval, lcp_center) = - self.compare(search_string, self.sa[0], min(lcp_left, lcp_right), bound); + self.compare(search_string, self.sa.get(0), min(lcp_left, lcp_right), bound); found |= lcp_center == search_string.len(); @@ -339,7 +339,7 @@ impl Searcher { // array (stop when our max number of matches is reached) let mut sa_index = min_bound; while sa_index < max_bound { - let suffix = self.sa[sa_index] as usize; + let suffix = self.sa.get(sa_index) as usize; // filter away matches where I was wrongfully equalized to L, and check the // unmatched prefix when I and L equalized, we only need to // check the prefix, not the whole match, when the prefix is 0, we don't need to diff --git a/sa-index/src/util.rs b/sa-index/src/util.rs deleted file mode 100644 index 2f3f467..0000000 --- a/sa-index/src/util.rs +++ /dev/null @@ -1,63 +0,0 @@ -use std::{ - fs::File, - io, - io::BufRead, - path::Path, - time::{ - SystemTime, - SystemTimeError, - UNIX_EPOCH - } -}; - -use crate::sa_searcher::Searcher; - -/// Gets the current time in ms -/// -/// # Returns -/// -/// Returns the current time in ms -/// -/// # Errors -/// -/// Returns a SystemTimeError if getting the current time somehow fails -#[allow(unused)] -pub fn get_time_ms() -> Result { - Ok(SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos() as f64 * 1e-6) -} - -/// Times how long the function `f`, that has the searcher as only argument executed -/// -/// # Returns -/// -/// Returns the execution time of `f`in ms -/// -/// # Errors -/// -/// Returns a SystemTimeError if getting the start or end time failed -#[allow(unused)] -pub fn time_execution( - searcher: &mut Searcher, - f: &dyn Fn(&mut Searcher) -> bool -) -> Result<(bool, f64), SystemTimeError> { - let start_ms = get_time_ms()?; - let found = f(searcher); - let end_ms = get_time_ms()?; - Ok((found, end_ms - start_ms)) -} - -/// Opens `filename` and creates an iterator over it per line -/// -/// # Arguments -/// * `filename` - The file we want to iterate over per line -/// -/// # Returns -/// -/// Returns an Iterator to the Reader of the lines of the file. -pub fn read_lines
<P>
(filename: P) -> io::Result<io::Lines<io::BufReader<File>>> -where - P: AsRef<Path> -{ - let file = File::open(filename)?; - Ok(io::BufReader::new(file).lines()) -} diff --git a/sa-server/Cargo.toml b/sa-server/Cargo.toml index c1920f5..935e36b 100644 --- a/sa-server/Cargo.toml +++ b/sa-server/Cargo.toml @@ -12,4 +12,5 @@ tokio = { version = "1.36.0", features = ["rt", "rt-multi-thread", "macros"] } sa-index = { path = "../sa-index" } clap = { version = "4.5.1", features = ["derive"] } sa-builder = { path = "../sa-builder" } -sa-mappings = { path = "../sa-mappings" } \ No newline at end of file +sa-mappings = { path = "../sa-mappings" } +sa-compression = { path = "../sa-compression" } diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs index 98f2981..c84cdaf 100644 --- a/sa-server/src/main.rs +++ b/sa-server/src/main.rs @@ -1,6 +1,5 @@ use std::{ - error::Error, - sync::Arc + error::Error, fs::File, io::{BufReader, Read}, sync::Arc }; use axum::{ @@ -17,8 +16,9 @@ use axum::{ Router }; use clap::Parser; -use sa_builder::binary::load_suffix_array; +use sa_compression::load_compressed_suffix_array; use sa_index::{ + binary::load_suffix_array, peptide_search::{ analyse_all_peptides, search_all_peptides, @@ -27,7 +27,8 @@ use sa_index::{ SearchResultWithAnalysis }, sa_searcher::Searcher, - suffix_to_protein_index::SparseSuffixToProtein + suffix_to_protein_index::SparseSuffixToProtein, + SuffixArray }; use sa_mappings::{ functionality::FunctionAggregator, @@ -169,7 +170,7 @@ async fn start_server(args: Arguments) -> Result<(), Box<dyn Error>> { } = args; eprintln!("Loading suffix array..."); - let (sparseness_factor, sa) = load_suffix_array(&index_file)?; + let (sample_rate, sa) = load_suffix_array_file(&index_file)?; eprintln!("Loading taxon file..."); let taxon_id_calculator = @@ -184,7 +185,7 @@ async fn start_server(args: Arguments) -> Result<(), Box<dyn Error>> { eprintln!("Creating searcher..."); let searcher = Arc::new(Searcher::new( sa, - sparseness_factor, + sample_rate, suffix_index_to_protein, proteins, taxon_id_calculator, @@ -210,3 +211,24 @@ async fn start_server(args: Arguments) -> Result<(), Box<dyn Error>> { Ok(()) } + +fn load_suffix_array_file(file: &str) -> Result<(u8, SuffixArray), Box<dyn Error>> { + // Open the suffix array file + let mut sa_file = File::open(file)?; + + // Create a buffer reader for the file + let mut reader = BufReader::new(&mut sa_file); + + // Read the flags from the binary file (1 byte) + let mut flags_buffer = [0_u8; 1]; + reader.read_exact(&mut flags_buffer).map_err(|_| "Could not read the flags from the binary file")?; + let flags = flags_buffer[0]; + + if flags == 0 { + let (sample_rate, sa) = load_suffix_array(&mut reader)?; + Ok((sample_rate, SuffixArray::Original(sa))) + } else { + let (sample_rate, sa) = load_compressed_suffix_array(&mut reader)?; + Ok((sample_rate, SuffixArray::Compressed(sa))) + } +} From 3d9c4f85efe44ea8a869336583d91a0abe412405 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Fri, 17 May 2024 18:58:47 +0200 Subject: [PATCH 09/26] dynamic required bits per value --- bitarray/src/binary.rs | 6 +-- bitarray/src/lib.rs | 94 +++++++++++++++++++++++++++++++--------------- sa-builder/src/main.rs | 3 +- sa-compression/src/lib.rs | 22 ++++----- sa-index/src/binary.rs | 2 +- sa-index/src/lib.rs | 2 +- sa-server/src/main.rs | 12 ++--- 7 files changed, 84 insertions(+), 57 deletions(-) diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs index 36c8005..ecd27a5 100644 --- a/bitarray/src/binary.rs +++ b/bitarray/src/binary.rs @@ -30,7 +30,7 @@ pub trait Binary { } /// Implementation of the `Binary` trait for the
`BitArray` struct. -impl Binary for BitArray { +impl Binary for BitArray { /// Writes the binary representation of the `BitArray` to the given writer. /// /// # Arguments @@ -161,7 +161,7 @@ mod tests { #[test] fn test_write_binary() { - let mut bitarray = BitArray::<40>::with_capacity(4); + let mut bitarray = BitArray::with_capacity(4, 40); bitarray.set(0, 0x1234567890); bitarray.set(1, 0xabcdef0123); bitarray.set(2, 0x4567890abc); @@ -185,7 +185,7 @@ mod tests { 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 ]; - let mut bitarray = BitArray::<40>::with_capacity(4); + let mut bitarray = BitArray::with_capacity(4, 40); bitarray.read_binary(&buffer[..]).unwrap(); assert_eq!(bitarray.get(0), 0x1234567890); diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index e0014d3..0da16b7 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -2,22 +2,24 @@ mod binary; -use std::{io::{Result, Write}, ops::Index}; +use std::io::{Result, Write}; /// Re-export the `Binary` trait. pub use binary::Binary; /// A fixed-size bit array implementation. -pub struct BitArray { +pub struct BitArray { /// The underlying data storage for the bit array. data: Vec, /// The mask used to extract the relevant bits from each element in the data vector. mask: u64, /// The length of the bit array. - len: usize + len: usize, + /// The number of bits in a single element of the data vector. + bits_per_value: usize } -impl BitArray { +impl BitArray { /// Creates a new `BitArray` with the specified capacity. /// /// # Arguments @@ -27,11 +29,12 @@ impl BitArray { /// # Returns /// /// A new `BitArray` with the specified capacity. - pub fn with_capacity(capacity: usize) -> Self { + pub fn with_capacity(capacity: usize, bits_per_value: usize) -> Self { Self { - data: vec![0; capacity * B / 64 + 1], - mask: (1 << B) - 1, - len: capacity + data: vec![0; capacity * bits_per_value / 64 + 1], + mask: (1 << bits_per_value) - 1, + len: capacity, + bits_per_value } } @@ -45,18 +48,18 @@ impl BitArray { /// /// The value at the specified index. pub fn get(&self, index: usize) -> u64 { - let start_block = index * B / 64; - let start_block_offset = index * B % 64; + let start_block = index * self.bits_per_value / 64; + let start_block_offset = index * self.bits_per_value % 64; // If the value is contained within a single block - if start_block_offset + B <= 64 { + if start_block_offset + self.bits_per_value <= 64 { // Shift the value to the right so that the relevant bits are in the least significant // position Then mask out the irrelevant bits - return self.data[start_block] >> (64 - start_block_offset - B) & self.mask; + return self.data[start_block] >> (64 - start_block_offset - self.bits_per_value) & self.mask; } - let end_block = (index + 1) * B / 64; - let end_block_offset = (index + 1) * B % 64; + let end_block = (index + 1) * self.bits_per_value / 64; + let end_block_offset = (index + 1) * self.bits_per_value % 64; // Extract the relevant bits from the start block and shift them {end_block_offset} bits to // the left @@ -77,20 +80,20 @@ impl BitArray { /// * `index` - The index of the value to set. /// * `value` - The value to set at the specified index. 
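// Worked example of the split case handled below (illustrative values, assuming
// bits_per_value = 40): the value at index 1 occupies bit positions 40..80, so its
// first 24 bits land in the low end of data[0] and the remaining 16 bits in the top
// of data[1]; `set` clears and rewrites exactly those two regions.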
pub fn set(&mut self, index: usize, value: u64) { - let start_block = index * B / 64; - let start_block_offset = index * B % 64; + let start_block = index * self.bits_per_value / 64; + let start_block_offset = index * self.bits_per_value % 64; // If the value is contained within a single block - if start_block_offset + B <= 64 { + if start_block_offset + self.bits_per_value <= 64 { // Clear the relevant bits in the start block - self.data[start_block] &= !(self.mask << (64 - start_block_offset - B)); + self.data[start_block] &= !(self.mask << (64 - start_block_offset - self.bits_per_value)); // Set the relevant bits in the start block - self.data[start_block] |= value << (64 - start_block_offset - B); + self.data[start_block] |= value << (64 - start_block_offset - self.bits_per_value); return; } - let end_block = (index + 1) * B / 64; - let end_block_offset = (index + 1) * B % 64; + let end_block = (index + 1) * self.bits_per_value / 64; + let end_block_offset = (index + 1) * self.bits_per_value % 64; // Clear the relevant bits in the start block self.data[start_block] &= !(self.mask >> start_block_offset); @@ -140,17 +143,30 @@ impl BitArray { /// # Returns /// /// A `Result` indicating whether the write operation was successful or not. -pub fn data_to_writer( +pub fn data_to_writer( data: Vec, + bits_per_value: usize, + max_capacity: usize, writer: &mut impl Write, - max_capacity: usize ) -> Result<()> { // Calculate the capacity of the bit array so the data buffer can be stored entirely // This makes the process of writing partial data to the writer easier as bounds checking is not needed - let capacity = max_capacity / (B * 64) * B * 64; + let capacity = max_capacity / (bits_per_value * 64) * bits_per_value * 64; + + // If the capacity is 0, we can write the data directly to the writer + if capacity == 0 { + let mut bitarray = BitArray::with_capacity(data.len(), bits_per_value); + + for (i, &value) in data.iter().enumerate() { + bitarray.set(i, value as u64); + } + bitarray.write_binary(writer)?; + + return Ok(()); + } // Create a bit array that can store a single chunk of data - let mut bitarray = BitArray::::with_capacity(capacity); + let mut bitarray = BitArray::with_capacity(capacity, bits_per_value); // Write the data to the writer in chunks of the specified capacity let chunks = data.chunks_exact(capacity); @@ -167,7 +183,7 @@ pub fn data_to_writer( } // Create a new bit array with the remainder capacity - bitarray = BitArray::::with_capacity(remainder.len()); + bitarray = BitArray::with_capacity(remainder.len(), bits_per_value); for (i, &value) in remainder.iter().enumerate() { bitarray.set(i, value as u64); @@ -183,7 +199,7 @@ mod tests { #[test] fn test_bitarray_with_capacity() { - let bitarray = BitArray::<40>::with_capacity(4); + let bitarray = BitArray::with_capacity(4, 40); assert_eq!(bitarray.data, vec![0, 0, 0]); assert_eq!(bitarray.mask, 0xff_ffff_ffff); assert_eq!(bitarray.len, 4); @@ -191,7 +207,7 @@ mod tests { #[test] fn test_bitarray_get() { - let mut bitarray = BitArray::<40>::with_capacity(4); + let mut bitarray = BitArray::with_capacity(4, 40); bitarray.data = vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144eb9ca32eb4a4]; assert_eq!(bitarray.get(0), 0b0001110011111010110001000111111100110010); @@ -202,7 +218,7 @@ mod tests { #[test] fn test_bitarray_set() { - let mut bitarray = BitArray::<40>::with_capacity(4); + let mut bitarray = BitArray::with_capacity(4, 40); bitarray.set(0, 0b0001110011111010110001000111111100110010); bitarray.set(1, 
0b1100001001010010011000010100110111001001); @@ -214,28 +230,28 @@ mod tests { #[test] fn test_bitarray_len() { - let bitarray = BitArray::<40>::with_capacity(4); + let bitarray = BitArray::with_capacity(4, 40); assert_eq!(bitarray.len(), 4); } #[test] fn test_bitarray_is_empty() { - let bitarray = BitArray::<40>::with_capacity(0); + let bitarray = BitArray::with_capacity(0, 40); assert!(bitarray.is_empty()); } #[test] fn test_bitarray_is_not_empty() { - let bitarray = BitArray::<40>::with_capacity(4); + let bitarray = BitArray::with_capacity(4, 40); assert!(!bitarray.is_empty()); } #[test] - fn test_data_to_writer() { + fn test_data_to_writer_no_chunks_needed() { let data = vec![0x1234567890, 0xabcdef0123, 0x4567890abc, 0xdef0123456]; let mut writer = Vec::new(); - data_to_writer::<40>(data, &mut writer, 2).unwrap(); + data_to_writer(data, 40, 2, &mut writer).unwrap(); assert_eq!(writer, vec![ 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, @@ -243,4 +259,14 @@ mod tests { 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 ]); } + + // #[test] + // fn test_data_to_writer_chunks_needed() { + // todo!("Implement test"); + // } + + // #[test] + // fn test_data_to_writer_chunks_needed_plus_remainder() { + // todo!("Implement test"); + // } } diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index 9da2b99..ca07bff 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -45,7 +45,8 @@ fn main() { ); if compress_sa { - if let Err(err) = dump_compressed_suffix_array::<37>(sa, sparseness_factor, &mut file) { + let bits_per_value = (data.len() as f64).log2().ceil() as usize; + if let Err(err) = dump_compressed_suffix_array(sa, sparseness_factor, bits_per_value, &mut file) { eprint_and_exit(err.to_string().as_str()); }; } else { diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs index 4ee9abf..0bbc4ae 100644 --- a/sa-compression/src/lib.rs +++ b/sa-compression/src/lib.rs @@ -2,14 +2,15 @@ use std::{error::Error, io::{BufRead, Write}}; use bitarray::{data_to_writer, Binary, BitArray}; -pub fn dump_compressed_suffix_array( +pub fn dump_compressed_suffix_array( sa: Vec, - sparseness_factor: u8, + sparseness_factor: u8, + bits_per_value: usize, writer: &mut impl Write, ) -> Result<(), Box> { // Write the flags to the writer // 00000001 indicates that the suffix array is compressed - writer.write(&[1_u8]).map_err(|_| "Could not write the flags to the writer")?; + writer.write(&[bits_per_value as u8]).map_err(|_| "Could not write the required bits to the writer")?; // Write the sparseness factor to the writer writer.write(&[sparseness_factor]).map_err(|_| "Could not write the sparseness factor to the writer")?; @@ -18,14 +19,15 @@ pub fn dump_compressed_suffix_array( writer.write(&(sa.len() as u64).to_le_bytes()).map_err(|_| "Could not write the size of the suffix array to the writer")?; // Compress the suffix array and write it to the writer - data_to_writer::(sa, writer, 8 * 1024).map_err(|_| "Could not write the compressed suffix array to the writer")?; + data_to_writer(sa, bits_per_value, 8 * 1024, writer).map_err(|_| "Could not write the compressed suffix array to the writer")?; Ok(()) } -pub fn load_compressed_suffix_array( +pub fn load_compressed_suffix_array( reader: &mut impl BufRead, -) -> Result<(u8, BitArray), Box> { + bits_per_value: usize +) -> Result<(u8, BitArray), Box> { // Read the sample rate from the binary file (1 byte) let mut sample_rate_buffer = [0_u8; 1]; reader.read_exact(&mut sample_rate_buffer).map_err(|_| "Could not read the sample rate from 
the binary file")?; @@ -37,7 +39,7 @@ pub fn load_compressed_suffix_array( let size = u64::from_le_bytes(size_buffer) as usize; // Read the compressed suffix array from the binary file - let mut compressed_suffix_array = BitArray::::with_capacity(size); + let mut compressed_suffix_array = BitArray::with_capacity(size, bits_per_value); compressed_suffix_array.read_binary(reader).map_err(|_| "Could not read the compressed suffix array from the binary file")?; Ok((sample_rate, compressed_suffix_array)) @@ -52,7 +54,7 @@ mod tests { let sa = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; let mut writer = vec![]; - dump_compressed_suffix_array::<8>(sa, 1, &mut writer).unwrap(); + dump_compressed_suffix_array(sa, 1, 8, &mut writer).unwrap(); assert_eq!(writer, vec![ // flags @@ -70,8 +72,6 @@ mod tests { #[test] fn test_load_compressed_suffix_array() { let data = vec![ - // flags - 1, // sparseness factor 1, // size of the suffix array @@ -82,7 +82,7 @@ mod tests { ]; let mut reader = std::io::BufReader::new(&data[..]); - let (sample_rate, compressed_suffix_array) = load_compressed_suffix_array::<8>(&mut reader).unwrap(); + let (sample_rate, compressed_suffix_array) = load_compressed_suffix_array(&mut reader, 8).unwrap(); assert_eq!(sample_rate, 1); for i in 0..10 { diff --git a/sa-index/src/binary.rs b/sa-index/src/binary.rs index a86c382..7c59d0c 100644 --- a/sa-index/src/binary.rs +++ b/sa-index/src/binary.rs @@ -90,7 +90,7 @@ pub fn dump_suffix_array( ) -> Result<(), Box> { // Write the flags to the writer // 00000000 indicates that the suffix array is not compressed - writer.write(&[0_u8]).map_err(|_| "Could not write the flags to the writer")?; + writer.write(&[64_u8]).map_err(|_| "Could not write the flags to the writer")?; // Write the sparseness factor to the writer writer.write(&[sparseness_factor]).map_err(|_| "Could not write the sparseness factor to the writer")?; diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs index 2958c16..45cf307 100644 --- a/sa-index/src/lib.rs +++ b/sa-index/src/lib.rs @@ -7,7 +7,7 @@ pub mod suffix_to_protein_index; pub enum SuffixArray { Original(Vec), - Compressed(BitArray<37>) + Compressed(BitArray) } impl SuffixArray { diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs index c84cdaf..eacd212 100644 --- a/sa-server/src/main.rs +++ b/sa-server/src/main.rs @@ -219,16 +219,16 @@ fn load_suffix_array_file(file: &str) -> Result<(u8, SuffixArray), Box Date: Sat, 18 May 2024 00:32:47 +0200 Subject: [PATCH 10/26] update tests and documentation --- bitarray/src/lib.rs | 4 +- sa-builder/src/lib.rs | 116 ++++++++++++++++++++--- sa-compression/src/lib.rs | 26 ++++- sa-index/src/binary.rs | 193 +++++++++++++++++++++++++++++++++----- sa-index/src/lib.rs | 68 ++++++++++++++ 5 files changed, 367 insertions(+), 40 deletions(-) diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 0da16b7..0b1f2bc 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -25,6 +25,7 @@ impl BitArray { /// # Arguments /// /// * `capacity` - The number of bits the `BitArray` can hold. + /// * `bits_per_value` - The number of bits in a single value. /// /// # Returns /// @@ -137,8 +138,9 @@ impl BitArray { /// # Arguments /// /// * `data` - The data to write. -/// * `writer` - The writer to write the data to. +/// * `bits_per_value` - The number of bits in a single value. /// * `max_capacity` - The maximum amount of elements that may be stored in the bit array. +/// * `writer` - The writer to write the data to. 
/// /// # Returns /// diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 29097c9..7fe320c 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -12,8 +12,8 @@ pub struct Arguments { /// concatenated using a `#`. #[arg(short, long)] pub database_file: String, - #[arg(short, long)] /// The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy. + #[arg(short, long)] pub taxonomy: String, /// Output file to store the built index. #[arg(short, long)] @@ -22,6 +22,7 @@ pub struct Arguments { /// the SA is used) #[arg(long, default_value_t = 1)] pub sparseness_factor: u8, + /// The algorithm used to construct the suffix array (default value LibSais) #[arg(short, long, value_enum, default_value_t = SAConstructionAlgorithm::LibSais)] pub construction_algorithm: SAConstructionAlgorithm, /// If the suffix array should be compressed (default value true) @@ -36,52 +37,71 @@ pub enum SAConstructionAlgorithm { LibSais } -/// Gets the current time in ms +/// Build a sparse suffix array from the given text /// /// # Arguments -/// * `data` - The text on which we want to build the suffix array +/// * `text` - The text on which we want to build the suffix array /// * `construction_algorithm` - The algorithm used during construction /// * `sparseness_factor` - The sparseness factor used on the suffix array /// /// # Returns /// -/// Returns the constructed suffix array +/// Returns the constructed (sparse) suffix array /// /// # Errors /// /// The errors that occurred during the building of the suffix array itself pub fn build_ssa( - data: &mut Vec, + text: &mut Vec, construction_algorithm: &SAConstructionAlgorithm, sparseness_factor: u8 ) -> Result, Box> { // translate all L's to a I - translate_l_to_i(data); + translate_l_to_i(text); // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { - SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(data), - SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(data) + SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(text), + SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text) } .ok_or("Building suffix array failed")?; - // make the SA sparse and decrease the vector size if we have sampling (== sampling_rate > 1) - if sparseness_factor > 1 { - sample_sa(&mut sa, sparseness_factor) - } + // make the SA sparse and decrease the vector size if we have sampling (sampling_rate > 1) + sample_sa(&mut sa, sparseness_factor); Ok(sa) } -fn translate_l_to_i(data: &mut Vec) { - for character in data.iter_mut() { +/// Translate all L's to I's in the given text +/// +/// # Arguments +/// * `text` - The text in which we want to translate the L's to I's +/// +/// # Returns +/// +/// The text with all L's translated to I's +fn translate_l_to_i(text: &mut Vec) { + for character in text.iter_mut() { if *character == b'L' { *character = b'I' } } } +/// Sample the suffix array with the given sparseness factor +/// +/// # Arguments +/// * `sa` - The suffix array that we want to sample +/// * `sparseness_factor` - The sparseness factor used for sampling +/// +/// # Returns +/// +/// The sampled suffix array fn sample_sa(sa: &mut Vec, sparseness_factor: u8) { + if sparseness_factor <= 1 { + return; + } + let mut current_sampled_index = 0; for i in 0 .. 
sa.len() { let current_sa_val = sa[i]; @@ -94,3 +114,71 @@ fn sample_sa(sa: &mut Vec, sparseness_factor: u8) { // make shorter sa.resize(current_sampled_index, 0); } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_build_ssa_libsais() { + let mut text = b"ABRACADABRA$".to_vec(); + let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 1).unwrap(); + assert_eq!(sa, vec![11, 10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]); + } + + #[test] + fn test_build_ssa_libsais_empty() { + let mut text = b"".to_vec(); + let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 1).unwrap(); + assert_eq!(sa, vec![]); + } + + #[test] + fn test_build_ssa_libsais_sparse() { + let mut text = b"ABRACADABRA$".to_vec(); + let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 2).unwrap(); + assert_eq!(sa, vec![10, 0, 8, 4, 6, 2]); + } + + #[test] + fn test_build_ssa_libdivsufsort() { + let mut text = b"ABRACADABRA$".to_vec(); + let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap(); + assert_eq!(sa, vec![11, 10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]); + } + + #[test] + fn test_build_ssa_libdivsufsort_empty() { + let mut text = b"".to_vec(); + let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap(); + assert_eq!(sa, vec![]); + } + + #[test] + fn test_build_ssa_libdivsufsort_sparse() { + let mut text = b"ABRACADABRA$".to_vec(); + let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 2).unwrap(); + assert_eq!(sa, vec![10, 0, 8, 4, 6, 2]); + } + + #[test] + fn test_translate_l_to_i() { + let mut text = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ$-".to_vec(); + translate_l_to_i(&mut text); + assert_eq!(text, b"ABCDEFGHIJKIMNOPQRSTUVWXYZ$-".to_vec()); + } + + #[test] + fn test_sample_sa_1() { + let mut sa = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + sample_sa(&mut sa, 1); + assert_eq!(sa, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + } + + #[test] + fn test_sample_sa_2() { + let mut sa = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + sample_sa(&mut sa, 2); + assert_eq!(sa, vec![0, 2, 4, 6, 8]); + } +} diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs index 0bbc4ae..a762bf7 100644 --- a/sa-compression/src/lib.rs +++ b/sa-compression/src/lib.rs @@ -2,6 +2,18 @@ use std::{error::Error, io::{BufRead, Write}}; use bitarray::{data_to_writer, Binary, BitArray}; +/// Writes the compressed suffix array to a writer. +/// +/// # Arguments +/// +/// * `sa` - The suffix array to be compressed. +/// * `sparseness_factor` - The sparseness factor used for compression. +/// * `bits_per_value` - The number of bits used to represent each value in the compressed array. +/// * `writer` - The writer to which the compressed array will be written. +/// +/// # Errors +/// +/// Returns an error if writing to the writer fails. pub fn dump_compressed_suffix_array( sa: Vec, sparseness_factor: u8, @@ -24,6 +36,16 @@ pub fn dump_compressed_suffix_array( Ok(()) } +/// Load the compressed suffix array from a reader. +/// +/// # Arguments +/// +/// * `reader` - The reader from which the compressed array will be read. +/// * `bits_per_value` - The number of bits used to represent each value in the compressed array. +/// +/// # Errors +/// +/// Returns an error if reading from the reader fails. 
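// Stream layout consumed here, matching what dump_compressed_suffix_array writes
// (the leading bits-per-value byte has already been read by the caller): one byte
// sample rate, an 8-byte little-endian element count, then the bit-packed values.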
pub fn load_compressed_suffix_array( reader: &mut impl BufRead, bits_per_value: usize @@ -57,8 +79,8 @@ mod tests { dump_compressed_suffix_array(sa, 1, 8, &mut writer).unwrap(); assert_eq!(writer, vec![ - // flags - 1, + // bits per value + 8, // sparseness factor 1, // size of the suffix array diff --git a/sa-index/src/binary.rs b/sa-index/src/binary.rs index 7c59d0c..f9ed95d 100644 --- a/sa-index/src/binary.rs +++ b/sa-index/src/binary.rs @@ -25,7 +25,17 @@ pub trait Binary { fn read_binary(&mut self, reader: R) -> std::io::Result<()>; } +/// Implements the `Binary` trait for `Vec`. impl Binary for Vec { + /// Writes the elements of the vector to a binary file. + /// + /// # Arguments + /// + /// * `writer` - The writer to which the binary data will be written. + /// + /// # Returns + /// + /// Returns `Ok(())` if the write operation is successful, or an `std::io::Error` otherwise. fn write_binary(&self, writer: &mut W) -> std::io::Result<()> { for value in self { writer.write_all(&value.to_le_bytes())?; @@ -34,6 +44,15 @@ impl Binary for Vec { Ok(()) } + /// Reads binary data from a reader and populates the vector with the read values. + /// + /// # Arguments + /// + /// * `reader` - The reader from which the binary data will be read. + /// + /// # Returns + /// + /// Returns `Ok(())` if the read operation is successful, or an `std::io::Error` otherwise. fn read_binary(&mut self, mut reader: R) -> std::io::Result<()> { self.clear(); @@ -54,6 +73,39 @@ impl Binary for Vec { } } +/// Writes the suffix array to a binary file. +/// +/// # Arguments +/// +/// * `sa` - The suffix array to dump. +/// * `sparseness_factor` - The sparseness factor to write to the file. +/// * `writer` - The writer to write the binary data to. +/// +/// # Returns +/// +/// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs. 
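+// Note: the first byte doubles as a format tag. 64 (0b01000000) marks an
+// uncompressed array (64 bits per i64 value), while a compressed array stores its
+// smaller bits-per-value there, so a loader can dispatch on this single byte.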
+pub fn dump_suffix_array( + sa: &Vec, + sparseness_factor: u8, + writer: &mut impl Write, +) -> Result<(), Box> { + // Write the required bits to the writer + // 01000000 indicates that the suffix array is not compressed + writer.write(&[64_u8]).map_err(|_| "Could not write the required bits to the writer")?; + + // Write the sparseness factor to the writer + writer.write(&[sparseness_factor]).map_err(|_| "Could not write the sparseness factor to the writer")?; + + // Write the size of the suffix array to the writer + let sa_len = sa.len(); + writer.write(&(sa_len).to_le_bytes()).map_err(|_| "Could not write the size of the suffix array to the writer")?; + + // Write the suffix array to the writer + sa.write_binary(writer).map_err(|_| "Could not write the suffix array to the writer")?; + + Ok(()) +} + /// Loads the suffix array from the file with the given `filename` /// /// # Arguments @@ -83,29 +135,6 @@ pub fn load_suffix_array(reader: &mut impl BufRead) -> Result<(u8, Vec), Bo Ok((sample_rate, sa)) } -pub fn dump_suffix_array( - sa: &Vec, - sparseness_factor: u8, - writer: &mut impl Write, -) -> Result<(), Box> { - // Write the flags to the writer - // 00000000 indicates that the suffix array is not compressed - writer.write(&[64_u8]).map_err(|_| "Could not write the flags to the writer")?; - - // Write the sparseness factor to the writer - writer.write(&[sparseness_factor]).map_err(|_| "Could not write the sparseness factor to the writer")?; - - // Write the size of the suffix array to the writer - let sa_len = sa.len(); - writer.write(&(sa_len).to_le_bytes()).map_err(|_| "Could not write the size of the suffix array to the writer")?; - - // Write the suffix array to the writer - let sa = Vec::with_capacity(sa_len); - sa.write_binary(writer).map_err(|_| "Could not write the suffix array to the writer")?; - - Ok(()) -} - /// Fills the buffer with data read from the input. 
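// (Same buffered-read helper as in bitarray/src/binary.rs; here it feeds the
// i64 reader above.)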
/// /// # Arguments @@ -147,3 +176,121 @@ fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { } } } + +#[cfg(test)] +mod tests { + use super::*; + + pub struct ErrorInput; + + impl Read for ErrorInput { + fn read(&mut self, _buf: &mut [u8]) -> std::io::Result { + Err(std::io::Error::new(std::io::ErrorKind::Other, "read error")) + } + } + + #[test] + fn test_fill_buffer() { + let input_str = "a".repeat(8_000); + let mut input = input_str.as_bytes(); + + let mut buffer = vec![0; 800]; + + loop { + let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer); + + if finished { + assert!(bytes_read < 800); + break; + } else { + assert_eq!(bytes_read, 800); + } + } + } + + #[test] + #[should_panic(expected = "Error while reading input:")] + fn test_fill_buffer_read_error() { + let mut input = ErrorInput; + let mut buffer = vec![0; 800]; + + fill_buffer(&mut input, &mut buffer); + } + + #[test] + fn test_write_binary() { + let mut buffer = Vec::new(); + let values = vec![1, 2, 3, 4, 5]; + + values.write_binary(&mut buffer).unwrap(); + + assert_eq!(buffer, vec![ + 1, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, 0, 0, + 4, 0, 0, 0, 0, 0, 0, 0, + 5, 0, 0, 0, 0, 0, 0, 0 + ]); + } + + #[test] + fn test_read_binary() { + let buffer = vec![ + 1, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, 0, 0, + 4, 0, 0, 0, 0, 0, 0, 0, + 5, 0, 0, 0, 0, 0, 0, 0 + ]; + + let mut values = Vec::new(); + values.read_binary(buffer.as_slice()).unwrap(); + + assert_eq!(values, vec![1, 2, 3, 4, 5]); + } + + #[test] + fn test_dump_suffix_array() { + let mut buffer = Vec::new(); + let sa = vec![1, 2, 3, 4, 5]; + + dump_suffix_array(&sa, 1, &mut buffer).unwrap(); + + assert_eq!(buffer, vec![ + // required bits + 64, + // Sparseness factor + 1, + // Size of the suffix array + 5, 0, 0, 0, 0, 0, 0, 0, + // Suffix array + 1, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, 0, 0, + 4, 0, 0, 0, 0, 0, 0, 0, + 5, 0, 0, 0, 0, 0, 0, 0 + ]); + } + + #[test] + fn test_load_suffix_array() { + let buffer = vec![ + // Sample rate + 1, + // Size of the suffix array + 5, 0, 0, 0, 0, 0, 0, 0, + // Suffix array + 1, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, 0, 0, + 4, 0, 0, 0, 0, 0, 0, 0, + 5, 0, 0, 0, 0, 0, 0, 0 + ]; + + let mut reader = buffer.as_slice(); + let (sample_rate, sa) = load_suffix_array(&mut reader).unwrap(); + + assert_eq!(sample_rate, 1); + assert_eq!(sa, vec![1, 2, 3, 4, 5]); + } +} diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs index 45cf307..6ffc0e9 100644 --- a/sa-index/src/lib.rs +++ b/sa-index/src/lib.rs @@ -5,12 +5,20 @@ pub mod peptide_search; pub mod sa_searcher; pub mod suffix_to_protein_index; +/// Represents a suffix array. pub enum SuffixArray { + /// The original suffix array. Original(Vec), + /// The compressed suffix array. Compressed(BitArray) } impl SuffixArray { + /// Returns the length of the suffix array. + /// + /// # Returns + /// + /// The length of the suffix array. pub fn len(&self) -> usize { match self { SuffixArray::Original(sa) => sa.len(), @@ -18,6 +26,15 @@ impl SuffixArray { } } + /// Returns the suffix array at the given index. + /// + /// # Arguments + /// + /// * `index` - The index of the suffix array. + /// + /// # Returns + /// + /// The suffix array at the given index. pub fn get(&self, index: usize) -> i64 { match self { SuffixArray::Original(sa) => sa[index], @@ -25,6 +42,11 @@ impl SuffixArray { } } + /// Returns whether the suffix array is empty. 
+ /// + /// # Returns + /// + /// True if the suffix array is empty, false otherwise. pub fn is_empty(&self) -> bool { self.len() == 0 } @@ -34,9 +56,15 @@ impl SuffixArray { pub trait Nullable { const NULL: T; + /// Returns whether the value is NULL. + /// + /// # Returns + /// + /// True if the value is NULL, false otherwise. fn is_null(&self) -> bool; } +/// Implementation of the `Nullable` trait for the `u32` type. impl Nullable for u32 { const NULL: u32 = u32::MAX; @@ -44,3 +72,43 @@ impl Nullable for u32 { *self == Self::NULL } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_suffix_array_original() { + let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5]); + assert_eq!(sa.len(), 5); + assert_eq!(sa.get(0), 1); + assert_eq!(sa.get(1), 2); + assert_eq!(sa.get(2), 3); + assert_eq!(sa.get(3), 4); + assert_eq!(sa.get(4), 5); + } + + #[test] + fn test_suffix_array_compressed() { + let mut bitarray = BitArray::with_capacity(5, 40); + bitarray.set(0, 1); + bitarray.set(1, 2); + bitarray.set(2, 3); + bitarray.set(3, 4); + bitarray.set(4, 5); + + let sa = SuffixArray::Compressed(bitarray); + assert_eq!(sa.len(), 5); + assert_eq!(sa.get(0), 1); + assert_eq!(sa.get(1), 2); + assert_eq!(sa.get(2), 3); + assert_eq!(sa.get(3), 4); + assert_eq!(sa.get(4), 5); + } + + #[test] + fn test_nullable_is_null() { + assert_eq!(u32::NULL.is_null(), true); + assert_eq!(0u32.is_null(), false); + } +} From 58772f1ba93571d9e9c7746b3080f3c4191b25dc Mon Sep 17 00:00:00 2001 From: tibvdm Date: Sat, 18 May 2024 00:39:04 +0200 Subject: [PATCH 11/26] update coverage config --- .github/workflows/coverage.yml | 76 +++++++++++++++++++++++++++++++++- codecov.yml | 38 ++++++++++++++--- 2 files changed, 107 insertions(+), 7 deletions(-) diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 88368eb..f015f40 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -71,6 +71,80 @@ jobs: + - name: Run cargo test (sa-builder) + uses: actions-rs/cargo@v1 + with: + command: test + args: --all-features --no-fail-fast -p sa-builder + env: + CARGO_INCREMENTAL: 0 + RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + + - name: Gather coverage information (sa-builder) + id: coverage-sa-builder + uses: actions-rs/grcov@v0.1 + + - name: Upload coverage reports to Codecov (sa-builder) + uses: codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ${{ steps.coverage-sa-builder.outputs.report }} + flags: sa-builder + verbose: true + fail_ci_if_error: true + + + + - name: Run cargo test (sa-compression) + uses: actions-rs/cargo@v1 + with: + command: test + args: --all-features --no-fail-fast -p sa-compression + env: + CARGO_INCREMENTAL: 0 + RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + + - name: Gather coverage information (sa-compression) + id: coverage-sa-compression + uses: actions-rs/grcov@v0.1 + + - name: Upload coverage reports to Codecov (sa-compression) + uses: codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ${{ 
steps.coverage-sa-compression.outputs.report }} + flags: sa-compression + verbose: true + fail_ci_if_error: true + + + + - name: Run cargo test (sa-index) + uses: actions-rs/cargo@v1 + with: + command: test + args: --all-features --no-fail-fast -p sa-index + env: + CARGO_INCREMENTAL: 0 + RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + + - name: Gather coverage information (sa-index) + id: coverage-sa-index + uses: actions-rs/grcov@v0.1 + + - name: Upload coverage reports to Codecov (sa-index) + uses: codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ${{ steps.coverage-sa-index.outputs.report }} + flags: sa-index + verbose: true + fail_ci_if_error: true + + - name: Run cargo test (sa-mappings) uses: actions-rs/cargo@v1 with: @@ -92,4 +166,4 @@ jobs: file: ${{ steps.coverage-sa-mappings.outputs.report }} flags: sa-mappings verbose: true - fail_ci_if_error: true + fail_ci_if_error: true \ No newline at end of file diff --git a/codecov.yml b/codecov.yml index 6d769bf..4056f16 100644 --- a/codecov.yml +++ b/codecov.yml @@ -4,30 +4,44 @@ coverage: default: target: 90% bitarray: - target: 90% flags: - bitarray fa-compression: - target: 90% flags: - fa-compression + sa-builder: + flags: + - sa-builder + sa-compression: + flags: + - sa-compression + sa-index: + target: 65% + flags: + - sa-index sa-mappings: - target: 90% flags: - sa-mappings patch: default: target: 90% bitarray: - target: 90% flags: - bitarray fa-compression: - target: 90% flags: - fa-compression + sa-builder: + flags: + - sa-builder + sa-compression: + flags: + - sa-compression + sa-index: + target: 65% + flags: + - sa-index sa-mappings: - target: 90% flags: - sa-mappings @@ -40,6 +54,18 @@ flags: paths: - fa-compression carryforward: true + sa-builder: + paths: + - sa-builder + carryforward: true + sa-compression: + paths: + - sa-compression + carryforward: true + sa-index: + paths: + - sa-index + carryforward: true sa-mappings: paths: - sa-mappings From 9f3d6f46ed1f60382d1f42354b29bf9b1be3eedb Mon Sep 17 00:00:00 2001 From: tibvdm Date: Sat, 18 May 2024 00:41:59 +0200 Subject: [PATCH 12/26] fmt + clippy --- bitarray/src/binary.rs | 39 +++++++------ bitarray/src/lib.rs | 38 ++++++++----- sa-builder/src/lib.rs | 16 +++--- sa-builder/src/main.rs | 42 +++++++------- sa-compression/src/lib.rs | 80 ++++++++++++++++---------- sa-index/src/binary.rs | 108 +++++++++++++++++++----------------- sa-index/src/lib.rs | 20 +++---- sa-index/src/sa_searcher.rs | 6 +- sa-server/src/main.rs | 12 +++- 9 files changed, 209 insertions(+), 152 deletions(-) diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs index ecd27a5..59b3293 100644 --- a/bitarray/src/binary.rs +++ b/bitarray/src/binary.rs @@ -1,6 +1,11 @@ //! This module provides utilities for reading and writing the bitarray as binary. -use std::io::{BufRead, Read, Result, Write}; +use std::io::{ + BufRead, + Read, + Result, + Write +}; use crate::BitArray; @@ -61,11 +66,12 @@ impl Binary for BitArray { self.data.clear(); let mut buffer = vec![0; 8 * 1024]; - + loop { let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer); - for buffer_slice in buffer[..bytes_read].chunks_exact(8) { - self.data.push(u64::from_le_bytes(buffer_slice.try_into().unwrap())); + for buffer_slice in buffer[.. 
bytes_read].chunks_exact(8) { + self.data + .push(u64::from_le_bytes(buffer_slice.try_into().unwrap())); } if finished { @@ -86,8 +92,8 @@ impl Binary for BitArray { /// /// # Returns /// -/// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input is reached, -/// and `bytes_read` is the number of bytes read into the buffer. +/// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input +/// is reached, and `bytes_read` is the number of bytes read into the buffer. fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { // Store the buffer size in advance, because rust will complain // about the buffer being borrowed mutably while it's borrowed @@ -109,7 +115,7 @@ fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { // We've read {bytes_read} bytes Ok(bytes_read) => { // Shrink the writable buffer slice - writable_buffer_space = writable_buffer_space[bytes_read..].as_mut(); + writable_buffer_space = writable_buffer_space[bytes_read ..].as_mut(); } Err(err) => { @@ -137,7 +143,7 @@ mod tests { let mut input = input_str.as_bytes(); let mut buffer = vec![0; 800]; - + loop { let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer); @@ -170,19 +176,20 @@ mod tests { let mut buffer = Vec::new(); bitarray.write_binary(&mut buffer).unwrap(); - assert_eq!(buffer, vec![ - 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, - 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 - ]); + assert_eq!( + buffer, + vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, + 0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ] + ); } #[test] fn test_read_binary() { let buffer = vec![ - 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, - 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, + 0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0, ]; let mut bitarray = BitArray::with_capacity(4, 40); diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 0b1f2bc..0ff6b9a 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -2,7 +2,10 @@ mod binary; -use std::io::{Result, Write}; +use std::io::{ + Result, + Write +}; /// Re-export the `Binary` trait. pub use binary::Binary; @@ -10,11 +13,11 @@ pub use binary::Binary; /// A fixed-size bit array implementation. pub struct BitArray { /// The underlying data storage for the bit array. - data: Vec, + data: Vec, /// The mask used to extract the relevant bits from each element in the data vector. - mask: u64, + mask: u64, /// The length of the bit array. - len: usize, + len: usize, /// The number of bits in a single element of the data vector. 
bits_per_value: usize } @@ -34,7 +37,7 @@ impl BitArray { Self { data: vec![0; capacity * bits_per_value / 64 + 1], mask: (1 << bits_per_value) - 1, - len: capacity, + len: capacity, bits_per_value } } @@ -56,7 +59,8 @@ impl BitArray { if start_block_offset + self.bits_per_value <= 64 { // Shift the value to the right so that the relevant bits are in the least significant // position Then mask out the irrelevant bits - return self.data[start_block] >> (64 - start_block_offset - self.bits_per_value) & self.mask; + return self.data[start_block] >> (64 - start_block_offset - self.bits_per_value) + & self.mask; } let end_block = (index + 1) * self.bits_per_value / 64; @@ -87,7 +91,8 @@ impl BitArray { // If the value is contained within a single block if start_block_offset + self.bits_per_value <= 64 { // Clear the relevant bits in the start block - self.data[start_block] &= !(self.mask << (64 - start_block_offset - self.bits_per_value)); + self.data[start_block] &= + !(self.mask << (64 - start_block_offset - self.bits_per_value)); // Set the relevant bits in the start block self.data[start_block] |= value << (64 - start_block_offset - self.bits_per_value); return; @@ -146,13 +151,14 @@ impl BitArray { /// /// A `Result` indicating whether the write operation was successful or not. pub fn data_to_writer( - data: Vec, + data: Vec, bits_per_value: usize, max_capacity: usize, - writer: &mut impl Write, + writer: &mut impl Write ) -> Result<()> { // Calculate the capacity of the bit array so the data buffer can be stored entirely - // This makes the process of writing partial data to the writer easier as bounds checking is not needed + // This makes the process of writing partial data to the writer easier as bounds checking is not + // needed let capacity = max_capacity / (bits_per_value * 64) * bits_per_value * 64; // If the capacity is 0, we can write the data directly to the writer @@ -255,11 +261,13 @@ mod tests { data_to_writer(data, 40, 2, &mut writer).unwrap(); - assert_eq!(writer, vec![ - 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, - 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 - ]); + assert_eq!( + writer, + vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, + 0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ] + ); } // #[test] diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 7fe320c..41cb0d2 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -73,14 +73,14 @@ pub fn build_ssa( } /// Translate all L's to I's in the given text -/// +/// /// # Arguments /// * `text` - The text in which we want to translate the L's to I's -/// +/// /// # Returns -/// +/// /// The text with all L's translated to I's -fn translate_l_to_i(text: &mut Vec) { +fn translate_l_to_i(text: &mut [u8]) { for character in text.iter_mut() { if *character == b'L' { *character = b'I' @@ -89,13 +89,13 @@ fn translate_l_to_i(text: &mut Vec) { } /// Sample the suffix array with the given sparseness factor -/// +/// /// # Arguments /// * `sa` - The suffix array that we want to sample /// * `sparseness_factor` - The sparseness factor used for sampling -/// +/// /// # Returns -/// +/// /// The sampled suffix array fn sample_sa(sa: &mut Vec, sparseness_factor: u8) { if sparseness_factor <= 1 { @@ -110,7 +110,7 @@ fn sample_sa(sa: &mut Vec, sparseness_factor: u8) { current_sampled_index += 1; } } - + // make shorter sa.resize(current_sampled_index, 0); } diff --git a/sa-builder/src/main.rs 
b/sa-builder/src/main.rs index ca07bff..a78fdce 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -1,12 +1,18 @@ -use std::{fs::{File, OpenOptions}, io::Result}; +use std::{ + fs::{ + File, + OpenOptions + }, + io::Result +}; use clap::Parser; use sa_builder::{ build_ssa, Arguments }; -use sa_index::binary::dump_suffix_array; use sa_compression::dump_compressed_suffix_array; +use sa_index::binary::dump_suffix_array; use sa_mappings::{ proteins::Proteins, taxonomy::{ @@ -25,34 +31,32 @@ fn main() { compress_sa } = Arguments::parse(); - let taxon_id_calculator = TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar).unwrap_or_else( - |err| eprint_and_exit(err.to_string().as_str()) - ); + let taxon_id_calculator = + TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar) + .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); // read input - let mut data = Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator).unwrap_or_else( - |err| eprint_and_exit(err.to_string().as_str()) - ); + let mut data = + Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator) + .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); // calculate sparse suffix array - let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor).unwrap_or_else( - |err| eprint_and_exit(err.to_string().as_str()) - ); + let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor) + .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); // open the output file - let mut file = open_file(&output).unwrap_or_else( - |err| eprint_and_exit(err.to_string().as_str()) - ); + let mut file = + open_file(&output).unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); if compress_sa { let bits_per_value = (data.len() as f64).log2().ceil() as usize; - if let Err(err) = dump_compressed_suffix_array(sa, sparseness_factor, bits_per_value, &mut file) { - eprint_and_exit(err.to_string().as_str()); - }; - } else { - if let Err(err) = dump_suffix_array(&sa, sparseness_factor, &mut file) { + if let Err(err) = + dump_compressed_suffix_array(sa, sparseness_factor, bits_per_value, &mut file) + { eprint_and_exit(err.to_string().as_str()); }; + } else if let Err(err) = dump_suffix_array(&sa, sparseness_factor, &mut file) { + eprint_and_exit(err.to_string().as_str()); } } diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs index a762bf7..62362ac 100644 --- a/sa-compression/src/lib.rs +++ b/sa-compression/src/lib.rs @@ -1,6 +1,16 @@ -use std::{error::Error, io::{BufRead, Write}}; +use std::{ + error::Error, + io::{ + BufRead, + Write + } +}; -use bitarray::{data_to_writer, Binary, BitArray}; +use bitarray::{ + data_to_writer, + Binary, + BitArray +}; /// Writes the compressed suffix array to a writer. /// @@ -15,23 +25,30 @@ use bitarray::{data_to_writer, Binary, BitArray}; /// /// Returns an error if writing to the writer fails. 
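// Worked size example (matching test_dump_compressed_suffix_array further down):
// ten values at 8 bits each pack into 2 u64 words, so the output totals
// 1 (bits per value) + 1 (sparseness factor) + 8 (length) + 16 (packed data) = 26 bytes.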
pub fn dump_compressed_suffix_array( - sa: Vec, + sa: Vec, sparseness_factor: u8, bits_per_value: usize, - writer: &mut impl Write, + writer: &mut impl Write ) -> Result<(), Box> { // Write the flags to the writer // 00000001 indicates that the suffix array is compressed - writer.write(&[bits_per_value as u8]).map_err(|_| "Could not write the required bits to the writer")?; + writer + .write(&[bits_per_value as u8]) + .map_err(|_| "Could not write the required bits to the writer")?; // Write the sparseness factor to the writer - writer.write(&[sparseness_factor]).map_err(|_| "Could not write the sparseness factor to the writer")?; + writer + .write(&[sparseness_factor]) + .map_err(|_| "Could not write the sparseness factor to the writer")?; // Write the size of the suffix array to the writer - writer.write(&(sa.len() as u64).to_le_bytes()).map_err(|_| "Could not write the size of the suffix array to the writer")?; + writer + .write(&(sa.len() as u64).to_le_bytes()) + .map_err(|_| "Could not write the size of the suffix array to the writer")?; // Compress the suffix array and write it to the writer - data_to_writer(sa, bits_per_value, 8 * 1024, writer).map_err(|_| "Could not write the compressed suffix array to the writer")?; + data_to_writer(sa, bits_per_value, 8 * 1024, writer) + .map_err(|_| "Could not write the compressed suffix array to the writer")?; Ok(()) } @@ -52,17 +69,23 @@ pub fn load_compressed_suffix_array( ) -> Result<(u8, BitArray), Box> { // Read the sample rate from the binary file (1 byte) let mut sample_rate_buffer = [0_u8; 1]; - reader.read_exact(&mut sample_rate_buffer).map_err(|_| "Could not read the sample rate from the binary file")?; + reader + .read_exact(&mut sample_rate_buffer) + .map_err(|_| "Could not read the sample rate from the binary file")?; let sample_rate = sample_rate_buffer[0]; // Read the size of the suffix array from the binary file (8 bytes) let mut size_buffer = [0_u8; 8]; - reader.read_exact(&mut size_buffer).map_err(|_| "Could not read the size of the suffix array from the binary file")?; + reader + .read_exact(&mut size_buffer) + .map_err(|_| "Could not read the size of the suffix array from the binary file")?; let size = u64::from_le_bytes(size_buffer) as usize; // Read the compressed suffix array from the binary file let mut compressed_suffix_array = BitArray::with_capacity(size, bits_per_value); - compressed_suffix_array.read_binary(reader).map_err(|_| "Could not read the compressed suffix array from the binary file")?; + compressed_suffix_array + .read_binary(reader) + .map_err(|_| "Could not read the compressed suffix array from the binary file")?; Ok((sample_rate, compressed_suffix_array)) } @@ -78,36 +101,33 @@ mod tests { let mut writer = vec![]; dump_compressed_suffix_array(sa, 1, 8, &mut writer).unwrap(); - assert_eq!(writer, vec![ - // bits per value - 8, - // sparseness factor - 1, - // size of the suffix array - 10, 0, 0, 0, 0, 0, 0, 0, - // compressed suffix array - 8, 7, 6, 5, 4, 3, 2, 1, - 0, 0, 0, 0, 0, 0, 10, 9 - ]); + assert_eq!( + writer, + vec![ + // bits per value + 8, // sparseness factor + 1, // size of the suffix array + 10, 0, 0, 0, 0, 0, 0, 0, // compressed suffix array + 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 10, 9 + ] + ); } #[test] fn test_load_compressed_suffix_array() { let data = vec![ // sparseness factor - 1, - // size of the suffix array - 10, 0, 0, 0, 0, 0, 0, 0, - // compressed suffix array - 8, 7, 6, 5, 4, 3, 2, 1, - 0, 0, 0, 0, 0, 0, 10, 9 + 1, // size of the suffix array + 10, 0, 0, 0, 0, 0, 0, 0, // 
compressed suffix array + 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 10, 9, ]; let mut reader = std::io::BufReader::new(&data[..]); - let (sample_rate, compressed_suffix_array) = load_compressed_suffix_array(&mut reader, 8).unwrap(); + let (sample_rate, compressed_suffix_array) = + load_compressed_suffix_array(&mut reader, 8).unwrap(); assert_eq!(sample_rate, 1); - for i in 0..10 { + for i in 0 .. 10 { assert_eq!(compressed_suffix_array.get(i), i as u64 + 1); } } diff --git a/sa-index/src/binary.rs b/sa-index/src/binary.rs index f9ed95d..48e1e84 100644 --- a/sa-index/src/binary.rs +++ b/sa-index/src/binary.rs @@ -1,4 +1,11 @@ -use std::{error::Error, io::{BufRead, Read, Write}}; +use std::{ + error::Error, + io::{ + BufRead, + Read, + Write + } +}; /// The `Binary` trait provides methods for reading and writing a struct as binary. pub trait Binary { @@ -57,10 +64,10 @@ impl Binary for Vec { self.clear(); let mut buffer = vec![0; 8 * 1024]; - + loop { let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer); - for buffer_slice in buffer[..bytes_read].chunks_exact(8) { + for buffer_slice in buffer[.. bytes_read].chunks_exact(8) { self.push(i64::from_le_bytes(buffer_slice.try_into().unwrap())); } @@ -87,21 +94,28 @@ impl Binary for Vec { pub fn dump_suffix_array( sa: &Vec, sparseness_factor: u8, - writer: &mut impl Write, + writer: &mut impl Write ) -> Result<(), Box> { // Write the required bits to the writer // 01000000 indicates that the suffix array is not compressed - writer.write(&[64_u8]).map_err(|_| "Could not write the required bits to the writer")?; + writer + .write(&[64_u8]) + .map_err(|_| "Could not write the required bits to the writer")?; // Write the sparseness factor to the writer - writer.write(&[sparseness_factor]).map_err(|_| "Could not write the sparseness factor to the writer")?; + writer + .write(&[sparseness_factor]) + .map_err(|_| "Could not write the sparseness factor to the writer")?; // Write the size of the suffix array to the writer let sa_len = sa.len(); - writer.write(&(sa_len).to_le_bytes()).map_err(|_| "Could not write the size of the suffix array to the writer")?; + writer + .write(&(sa_len).to_le_bytes()) + .map_err(|_| "Could not write the size of the suffix array to the writer")?; // Write the suffix array to the writer - sa.write_binary(writer).map_err(|_| "Could not write the suffix array to the writer")?; + sa.write_binary(writer) + .map_err(|_| "Could not write the suffix array to the writer")?; Ok(()) } @@ -121,16 +135,21 @@ pub fn dump_suffix_array( pub fn load_suffix_array(reader: &mut impl BufRead) -> Result<(u8, Vec), Box> { // Read the sample rate from the binary file (1 byte) let mut sample_rate_buffer = [0_u8; 1]; - reader.read_exact(&mut sample_rate_buffer).map_err(|_| "Could not read the sample rate from the binary file")?; + reader + .read_exact(&mut sample_rate_buffer) + .map_err(|_| "Could not read the sample rate from the binary file")?; let sample_rate = sample_rate_buffer[0]; // Read the size of the suffix array from the binary file (8 bytes) let mut size_buffer = [0_u8; 8]; - reader.read_exact(&mut size_buffer).map_err(|_| "Could not read the size of the suffix array from the binary file")?; + reader + .read_exact(&mut size_buffer) + .map_err(|_| "Could not read the size of the suffix array from the binary file")?; let size = u64::from_le_bytes(size_buffer) as usize; let mut sa = Vec::with_capacity(size); - sa.read_binary(reader).map_err(|_| "Could not read the suffix array from the binary file")?; + sa.read_binary(reader) + 
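
// ---------------------------------------------------------------------------
// Editor's sketch (illustrative, not part of this patch). A round trip
// through the Binary impl for Vec<i64> shown above: values are written as
// 8-byte little-endian words and read back in the same order. Assumes the
// Binary trait from this module is in scope.
fn binary_round_trip_example() -> std::io::Result<()> {
    let values: Vec<i64> = vec![10, -3, 42];
    let mut bytes = Vec::new();
    values.write_binary(&mut bytes)?;

    let mut restored: Vec<i64> = Vec::new();
    restored.read_binary(&bytes[..])?;

    assert_eq!(restored, values);
    Ok(())
}
// ---------------------------------------------------------------------------
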
.map_err(|_| "Could not read the suffix array from the binary file")?; Ok((sample_rate, sa)) } @@ -144,8 +163,8 @@ pub fn load_suffix_array(reader: &mut impl BufRead) -> Result<(u8, Vec), Bo /// /// # Returns /// -/// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input is reached, -/// and `bytes_read` is the number of bytes read into the buffer. +/// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input +/// is reached, and `bytes_read` is the number of bytes read into the buffer. fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { // Store the buffer size in advance, because rust will complain // about the buffer being borrowed mutably while it's borrowed @@ -167,7 +186,7 @@ fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { // We've read {bytes_read} bytes Ok(bytes_read) => { // Shrink the writable buffer slice - writable_buffer_space = writable_buffer_space[bytes_read..].as_mut(); + writable_buffer_space = writable_buffer_space[bytes_read ..].as_mut(); } Err(err) => { @@ -195,7 +214,7 @@ mod tests { let mut input = input_str.as_bytes(); let mut buffer = vec![0; 800]; - + loop { let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer); @@ -224,23 +243,20 @@ mod tests { values.write_binary(&mut buffer).unwrap(); - assert_eq!(buffer, vec![ - 1, 0, 0, 0, 0, 0, 0, 0, - 2, 0, 0, 0, 0, 0, 0, 0, - 3, 0, 0, 0, 0, 0, 0, 0, - 4, 0, 0, 0, 0, 0, 0, 0, - 5, 0, 0, 0, 0, 0, 0, 0 - ]); + assert_eq!( + buffer, + vec![ + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0 + ] + ); } #[test] fn test_read_binary() { let buffer = vec![ - 1, 0, 0, 0, 0, 0, 0, 0, - 2, 0, 0, 0, 0, 0, 0, 0, - 3, 0, 0, 0, 0, 0, 0, 0, - 4, 0, 0, 0, 0, 0, 0, 0, - 5, 0, 0, 0, 0, 0, 0, 0 + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, + 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, ]; let mut values = Vec::new(); @@ -256,35 +272,27 @@ mod tests { dump_suffix_array(&sa, 1, &mut buffer).unwrap(); - assert_eq!(buffer, vec![ - // required bits - 64, - // Sparseness factor - 1, - // Size of the suffix array - 5, 0, 0, 0, 0, 0, 0, 0, - // Suffix array - 1, 0, 0, 0, 0, 0, 0, 0, - 2, 0, 0, 0, 0, 0, 0, 0, - 3, 0, 0, 0, 0, 0, 0, 0, - 4, 0, 0, 0, 0, 0, 0, 0, - 5, 0, 0, 0, 0, 0, 0, 0 - ]); + assert_eq!( + buffer, + vec![ + // required bits + 64, // Sparseness factor + 1, // Size of the suffix array + 5, 0, 0, 0, 0, 0, 0, 0, // Suffix array + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0 + ] + ); } #[test] fn test_load_suffix_array() { let buffer = vec![ // Sample rate - 1, - // Size of the suffix array - 5, 0, 0, 0, 0, 0, 0, 0, - // Suffix array - 1, 0, 0, 0, 0, 0, 0, 0, - 2, 0, 0, 0, 0, 0, 0, 0, - 3, 0, 0, 0, 0, 0, 0, 0, - 4, 0, 0, 0, 0, 0, 0, 0, - 5, 0, 0, 0, 0, 0, 0, 0 + 1, // Size of the suffix array + 5, 0, 0, 0, 0, 0, 0, 0, // Suffix array + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, + 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, ]; let mut reader = buffer.as_slice(); diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs index 6ffc0e9..9e2f769 100644 --- a/sa-index/src/lib.rs +++ b/sa-index/src/lib.rs @@ -15,9 +15,9 @@ pub enum SuffixArray { impl SuffixArray { /// Returns the length of the suffix array. - /// + /// /// # Returns - /// + /// /// The length of the suffix array. 
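
// ---------------------------------------------------------------------------
// Editor's sketch (illustrative, not part of this patch). dump_suffix_array
// writes a flags byte (64 = uncompressed) before the fields that
// load_suffix_array reads back, so a round trip has to skip that first byte;
// the server reads it separately to decide which loader to call.
fn dump_load_round_trip_example() -> Result<(), Box<dyn std::error::Error>> {
    let sa: Vec<i64> = vec![1, 2, 3];
    let mut buffer = Vec::new();
    dump_suffix_array(&sa, 1, &mut buffer)?;

    let mut reader = &buffer[1..]; // skip the flags byte
    let (sample_rate, loaded) = load_suffix_array(&mut reader)?;

    assert_eq!((sample_rate, loaded), (1, sa));
    Ok(())
}
// ---------------------------------------------------------------------------
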
pub fn len(&self) -> usize { match self { @@ -27,13 +27,13 @@ impl SuffixArray { } /// Returns the suffix array at the given index. - /// + /// /// # Arguments - /// + /// /// * `index` - The index of the suffix array. - /// + /// /// # Returns - /// + /// /// The suffix array at the given index. pub fn get(&self, index: usize) -> i64 { match self { @@ -43,9 +43,9 @@ impl SuffixArray { } /// Returns whether the suffix array is empty. - /// + /// /// # Returns - /// + /// /// True if the suffix array is empty, false otherwise. pub fn is_empty(&self) -> bool { self.len() == 0 @@ -57,9 +57,9 @@ pub trait Nullable { const NULL: T; /// Returns whether the value is NULL. - /// + /// /// # Returns - /// + /// /// True if the value is NULL, false otherwise. fn is_null(&self) -> bool; } diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index b4c00a5..5bf924b 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -19,7 +19,8 @@ use crate::{ Minimum }, suffix_to_protein_index::SuffixToProteinIndex, - Nullable, SuffixArray + Nullable, + SuffixArray }; /// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array @@ -240,7 +241,8 @@ impl Searcher { while right - left > 1 { let center = (left + right) / 2; let skip = min(lcp_left, lcp_right); - let (retval, lcp_center) = self.compare(search_string, self.sa.get(center), skip, bound); + let (retval, lcp_center) = + self.compare(search_string, self.sa.get(center), skip, bound); found |= lcp_center == search_string.len(); diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs index eacd212..7fb3322 100644 --- a/sa-server/src/main.rs +++ b/sa-server/src/main.rs @@ -1,5 +1,11 @@ use std::{ - error::Error, fs::File, io::{BufReader, Read}, sync::Arc + error::Error, + fs::File, + io::{ + BufReader, + Read + }, + sync::Arc }; use axum::{ @@ -221,7 +227,9 @@ fn load_suffix_array_file(file: &str) -> Result<(u8, SuffixArray), Box Date: Sat, 18 May 2024 00:45:14 +0200 Subject: [PATCH 13/26] format --all --- sa-index/src/binary.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sa-index/src/binary.rs b/sa-index/src/binary.rs index 48e1e84..5fc5932 100644 --- a/sa-index/src/binary.rs +++ b/sa-index/src/binary.rs @@ -277,7 +277,7 @@ mod tests { vec![ // required bits 64, // Sparseness factor - 1, // Size of the suffix array + 1, // Size of the suffix array 5, 0, 0, 0, 0, 0, 0, 0, // Suffix array 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0 From f26cf34c1074be1203b9beeeccab544da0d8cff6 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Sat, 18 May 2024 03:26:47 +0200 Subject: [PATCH 14/26] improve testing --- bitarray/src/binary.rs | 18 ++-- bitarray/src/lib.rs | 104 +++++++++++++++--- sa-builder/src/lib.rs | 28 ++++- sa-compression/src/lib.rs | 116 ++++++++++++++++++++ sa-index/src/binary.rs | 134 +++++++++++++++++++++--- sa-index/src/lib.rs | 10 ++ sa-index/src/suffix_to_protein_index.rs | 8 ++ sa-mappings/src/functionality.rs | 72 ++++++++++--- 8 files changed, 433 insertions(+), 57 deletions(-) diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs index 59b3293..2609c3b 100644 --- a/bitarray/src/binary.rs +++ b/bitarray/src/binary.rs @@ -68,7 +68,7 @@ impl Binary for BitArray { let mut buffer = vec![0; 8 * 1024]; loop { - let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer); + let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer)?; for buffer_slice in buffer[.. 
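
// ---------------------------------------------------------------------------
// Editor's sketch (illustrative, not part of this patch). With fill_buffer
// now returning std::io::Result instead of panicking, a caller simply
// propagates read errors with `?`. Assumes the fill_buffer defined in this
// module; the function name is hypothetical.
fn count_bytes_example(mut input: impl std::io::Read) -> std::io::Result<usize> {
    let mut buffer = vec![0; 8 * 1024];
    let mut total = 0;
    loop {
        let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer)?;
        total += bytes_read;
        if finished {
            return Ok(total);
        }
    }
}
// ---------------------------------------------------------------------------
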
bytes_read].chunks_exact(8) { self.data .push(u64::from_le_bytes(buffer_slice.try_into().unwrap())); @@ -94,7 +94,7 @@ impl Binary for BitArray { /// /// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input /// is reached, and `bytes_read` is the number of bytes read into the buffer. -fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { +fn fill_buffer(input: &mut T, buffer: &mut Vec) -> std::io::Result<(bool, usize)> { // Store the buffer size in advance, because rust will complain // about the buffer being borrowed mutably while it's borrowed let buffer_size = buffer.len(); @@ -106,10 +106,10 @@ fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { // No bytes written, which means we've completely filled the buffer // or we've reached the end of the file Ok(0) => { - return ( + return Ok(( !writable_buffer_space.is_empty(), buffer_size - writable_buffer_space.len() - ); + )); } // We've read {bytes_read} bytes @@ -118,8 +118,9 @@ fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { writable_buffer_space = writable_buffer_space[bytes_read ..].as_mut(); } - Err(err) => { - panic!("Error while reading input: {}", err); + // An error occurred while reading + Err(e) => { + return Err(e); } } } @@ -145,7 +146,7 @@ mod tests { let mut buffer = vec![0; 800]; loop { - let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer); + let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer).unwrap(); if finished { assert!(bytes_read < 800); @@ -157,12 +158,11 @@ mod tests { } #[test] - #[should_panic(expected = "Error while reading input:")] fn test_fill_buffer_read_error() { let mut input = ErrorInput; let mut buffer = vec![0; 800]; - fill_buffer(&mut input, &mut buffer); + assert!(fill_buffer(&mut input, &mut buffer).is_err()); } #[test] diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 0ff6b9a..d2e000c 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -2,10 +2,10 @@ mod binary; -use std::io::{ +use std::{cmp::max, io::{ Result, Write -}; +}}; /// Re-export the `Binary` trait. pub use binary::Binary; @@ -34,8 +34,9 @@ impl BitArray { /// /// A new `BitArray` with the specified capacity. pub fn with_capacity(capacity: usize, bits_per_value: usize) -> Self { + let extra = if capacity * bits_per_value % 64 == 0 { 0 } else { 1 }; Self { - data: vec![0; capacity * bits_per_value / 64 + 1], + data: vec![0; capacity * bits_per_value / 64 + extra], mask: (1 << bits_per_value) - 1, len: capacity, bits_per_value @@ -156,13 +157,13 @@ pub fn data_to_writer( max_capacity: usize, writer: &mut impl Write ) -> Result<()> { - // Calculate the capacity of the bit array so the data buffer can be stored entirely - // This makes the process of writing partial data to the writer easier as bounds checking is not - // needed - let capacity = max_capacity / (bits_per_value * 64) * bits_per_value * 64; + // Update the max capacity to be a multiple of the greatest common divisor of the bits per value + // and 64. 
This is done to ensure that the bit array can store the data entirely
+    let greatest_common_divisor = gcd(bits_per_value, 64);
+    let capacity = max(greatest_common_divisor, max_capacity / greatest_common_divisor * greatest_common_divisor);
 
-    // If the capacity is 0, we can write the data directly to the writer
-    if capacity == 0 {
+    // If the amount of data is less than the max capacity, write the data to the writer in a single chunk
+    if data.len() <= capacity {
         let mut bitarray = BitArray::with_capacity(data.len(), bits_per_value);
 
         for (i, &value) in data.iter().enumerate() {
@@ -201,6 +202,26 @@ pub fn data_to_writer(
     Ok(())
 }
 
+/// Calculates the greatest common divisor of two numbers.
+///
+/// # Arguments
+///
+/// * `a` - The first number.
+/// * `b` - The second number.
+///
+/// # Returns
+///
+/// The greatest common divisor of the two numbers.
+fn gcd(mut a: usize, mut b: usize) -> usize {
+    while b != 0 {
+    if b < a {
+        std::mem::swap(&mut b, &mut a);
+    }
+    b %= a;
+    }
+    a
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -254,6 +275,16 @@ mod tests {
         assert!(!bitarray.is_empty());
     }
 
+    #[test]
+    fn test_bitarray_clear() {
+        let mut bitarray = BitArray::with_capacity(4, 40);
+        bitarray.data = vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144eb9ca32eb4a4];
+
+        bitarray.clear();
+
+        assert_eq!(bitarray.data, vec![0, 0, 0]);
+    }
+
     #[test]
     fn test_data_to_writer_no_chunks_needed() {
         let data = vec![0x1234567890, 0xabcdef0123, 0x4567890abc, 0xdef0123456];
@@ -270,13 +301,52 @@ mod tests {
         );
     }
 
-    // #[test]
-    // fn test_data_to_writer_chunks_needed() {
-    //     todo!("Implement test");
-    // }
+    #[test]
+    fn test_data_to_writer_chunks_needed_no_remainder() {
+        let data = vec![
+            0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777,
+            0x88888888
+        ];
+        let mut writer = Vec::new();
+
+        data_to_writer(data, 32, 8, &mut writer).unwrap();
+
+        assert_eq!(
+            writer,
+            vec![
+                0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33,
+                0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88,
+                0x88, 0x88, 0x77, 0x77, 0x77, 0x77
+            ]
+        );
+    }
 
-    // #[test]
-    // fn test_data_to_writer_chunks_needed_plus_remainder() {
-    //     todo!("Implement test");
-    // }
+    #[test]
+    fn test_data_to_writer_chunks_needed_plus_remainder() {
+        let data = vec![
+            0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777,
+            0x88888888, 0x99999999
+        ];
+        let mut writer = Vec::new();
+
+        data_to_writer(data, 32, 8, &mut writer).unwrap();
+
+        assert_eq!(
+            writer,
+            vec![
+                0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33,
+                0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88,
+                0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0x00, 0x00, 0x00, 0x00, 0x99, 0x99, 0x99,
+                0x99
+            ]
+        );
+    }
+
+    #[test]
+    fn test_gcd() {
+        assert_eq!(gcd(40, 64), 8);
+        assert_eq!(gcd(64, 40), 8);
+        assert_eq!(gcd(64, 64), 64);
+        assert_eq!(gcd(32, 64), 32);
+    }
 }
diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs
index 41cb0d2..1e6d834 100644
--- a/sa-builder/src/lib.rs
+++ b/sa-builder/src/lib.rs
@@ -26,7 +26,7 @@ pub struct Arguments {
     #[arg(short, long, value_enum, default_value_t = SAConstructionAlgorithm::LibSais)]
     pub construction_algorithm: SAConstructionAlgorithm,
-    /// If the suffix array should be compressed (default value true)
-    #[arg(long, default_value_t = true)]
+    /// If the suffix array should be compressed (default value false)
+    #[arg(long, default_value_t = false)]
     pub compress_sa: bool
 }
@@ -119,6 +119,32 @@ fn sample_sa(sa: &mut Vec<i64>, sparseness_factor: u8) {
 mod 
tests { use super::*; + #[test] + fn test_arguments() { + let args = Arguments::parse_from(&[ + "sa-builder", + "--database-file", "database.fa", + "--taxonomy", "taxonomy.tsv", + "--output", "output.fa", + "--sparseness-factor", "2", + "--construction-algorithm", "lib-div-suf-sort", + "--compress-sa" + ]); + + assert_eq!(args.database_file, "database.fa"); + assert_eq!(args.taxonomy, "taxonomy.tsv"); + assert_eq!(args.output, "output.fa"); + assert_eq!(args.sparseness_factor, 2); + assert_eq!(args.construction_algorithm, SAConstructionAlgorithm::LibDivSufSort); + assert_eq!(args.compress_sa, true); + } + + #[test] + fn test_sa_construction_algorithm() { + assert_eq!(SAConstructionAlgorithm::from_str("lib-div-suf-sort", false), Ok(SAConstructionAlgorithm::LibDivSufSort)); + assert_eq!(SAConstructionAlgorithm::from_str("lib-sais", false), Ok(SAConstructionAlgorithm::LibSais)); + } + #[test] fn test_build_ssa_libsais() { let mut text = b"ABRACADABRA$".to_vec(); diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs index 62362ac..1148aca 100644 --- a/sa-compression/src/lib.rs +++ b/sa-compression/src/lib.rs @@ -92,8 +92,54 @@ pub fn load_compressed_suffix_array( #[cfg(test)] mod tests { + use std::io::Read; + use super::*; + pub struct FailingWriter { + /// The number of times the write function can be called before it fails. + pub valid_write_count: usize + } + + impl Write for FailingWriter { + fn write(&mut self, _: &[u8]) -> Result { + if self.valid_write_count == 0 { + return Err(std::io::Error::new(std::io::ErrorKind::Other, "Write failed")); + } + + self.valid_write_count -= 1; + Ok(1) + } + + fn flush(&mut self) -> Result<(), std::io::Error> { + Ok(()) + } + } + + pub struct FailingReader { + /// The number of times the read function can be called before it fails. 
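
// Editor's note (illustrative, not part of this patch): FailingWriter
// consumes one valid_write_count per write call, so the tests below can
// target the n-th write in dump_compressed_suffix_array exactly: count 0
// fails on the flags byte, 1 on the sparseness factor, 2 on the size, and 3
// on the bit-packed payload. Note that write returns Ok(1) and the dump code
// does not check the returned byte count, which is why the 8-byte size
// "succeeds" after a single partial write while the count is still positive.
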
+ pub valid_read_count: usize + } + + impl Read for FailingReader { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + if self.valid_read_count == 0 { + return Err(std::io::Error::new(std::io::ErrorKind::Other, "Read failed")); + } + + self.valid_read_count -= 1; + Ok(buf.len()) + } + } + + impl BufRead for FailingReader { + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { + Ok(&[]) + } + + fn consume(&mut self, _: usize) {} + } + #[test] fn test_dump_compressed_suffix_array() { let sa = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; @@ -113,6 +159,46 @@ mod tests { ); } + #[test] + #[should_panic(expected = "Could not write the required bits to the writer")] + fn test_dump_compressed_suffix_array_fail_required_bits() { + let mut writer = FailingWriter { + valid_write_count: 0 + }; + + dump_compressed_suffix_array(vec![], 1, 8, &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the sparseness factor to the writer")] + fn test_dump_compressed_suffix_array_fail_sparseness_factor() { + let mut writer = FailingWriter { + valid_write_count: 1 + }; + + dump_compressed_suffix_array(vec![], 1, 8, &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the size of the suffix array to the writer")] + fn test_dump_compressed_suffix_array_fail_size() { + let mut writer = FailingWriter { + valid_write_count: 2 + }; + + dump_compressed_suffix_array(vec![], 1, 8, &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the compressed suffix array to the writer")] + fn test_dump_compressed_suffix_array_fail_compressed_suffix_array() { + let mut writer = FailingWriter { + valid_write_count: 3 + }; + + dump_compressed_suffix_array(vec![ 1 ], 1, 8, &mut writer).unwrap(); + } + #[test] fn test_load_compressed_suffix_array() { let data = vec![ @@ -131,4 +217,34 @@ mod tests { assert_eq!(compressed_suffix_array.get(i), i as u64 + 1); } } + + #[test] + #[should_panic(expected = "Could not read the sample rate from the binary file")] + fn test_load_compressed_suffix_array_fail_sample_rate() { + let mut reader = FailingReader { + valid_read_count: 0 + }; + + load_compressed_suffix_array(&mut reader, 8).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not read the size of the suffix array from the binary file")] + fn test_load_compressed_suffix_array_fail_size() { + let mut reader = FailingReader { + valid_read_count: 1 + }; + + load_compressed_suffix_array(&mut reader, 8).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not read the compressed suffix array from the binary file")] + fn test_load_compressed_suffix_array_fail_compressed_suffix_array() { + let mut reader = FailingReader { + valid_read_count: 2 + }; + + load_compressed_suffix_array(&mut reader, 8).unwrap(); + } } diff --git a/sa-index/src/binary.rs b/sa-index/src/binary.rs index 5fc5932..87e0d0d 100644 --- a/sa-index/src/binary.rs +++ b/sa-index/src/binary.rs @@ -66,7 +66,7 @@ impl Binary for Vec { let mut buffer = vec![0; 8 * 1024]; loop { - let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer); + let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer)?; for buffer_slice in buffer[.. 
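
// Editor's note (illustrative, not part of this patch): the read side works
// the same way. load_compressed_suffix_array issues its reads in a fixed
// order (sample rate, then the 8-byte size, then the payload), so
// valid_read_count values of 0, 1 and 2 in the tests above select exactly
// which of the three error messages is triggered.
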
bytes_read].chunks_exact(8) { self.push(i64::from_le_bytes(buffer_slice.try_into().unwrap())); } @@ -165,7 +165,7 @@ pub fn load_suffix_array(reader: &mut impl BufRead) -> Result<(u8, Vec), Bo /// /// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input /// is reached, and `bytes_read` is the number of bytes read into the buffer. -fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { +fn fill_buffer(input: &mut T, buffer: &mut Vec) -> std::io::Result<(bool, usize)> { // Store the buffer size in advance, because rust will complain // about the buffer being borrowed mutably while it's borrowed let buffer_size = buffer.len(); @@ -177,10 +177,10 @@ fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { // No bytes written, which means we've completely filled the buffer // or we've reached the end of the file Ok(0) => { - return ( + return Ok(( !writable_buffer_space.is_empty(), buffer_size - writable_buffer_space.len() - ); + )); } // We've read {bytes_read} bytes @@ -189,8 +189,9 @@ fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { writable_buffer_space = writable_buffer_space[bytes_read ..].as_mut(); } - Err(err) => { - panic!("Error while reading input: {}", err); + // An error occurred while reading + Err(e) => { + return Err(e); } } } @@ -200,12 +201,48 @@ fn fill_buffer(input: &mut T, buffer: &mut Vec) -> (bool, usize) { mod tests { use super::*; - pub struct ErrorInput; + pub struct FailingWriter { + /// The number of times the write function can be called before it fails. + pub valid_write_count: usize + } + + impl Write for FailingWriter { + fn write(&mut self, _: &[u8]) -> Result { + if self.valid_write_count == 0 { + return Err(std::io::Error::new(std::io::ErrorKind::Other, "Write failed")); + } + + self.valid_write_count -= 1; + Ok(1) + } + + fn flush(&mut self) -> Result<(), std::io::Error> { + Ok(()) + } + } + + pub struct FailingReader { + /// The number of times the read function can be called before it fails. 
+ pub valid_read_count: usize + } + + impl Read for FailingReader { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + if self.valid_read_count == 0 { + return Err(std::io::Error::new(std::io::ErrorKind::Other, "Read failed")); + } + + self.valid_read_count -= 1; + Ok(buf.len()) + } + } - impl Read for ErrorInput { - fn read(&mut self, _buf: &mut [u8]) -> std::io::Result { - Err(std::io::Error::new(std::io::ErrorKind::Other, "read error")) + impl BufRead for FailingReader { + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { + Ok(&[]) } + + fn consume(&mut self, _: usize) {} } #[test] @@ -216,7 +253,7 @@ mod tests { let mut buffer = vec![0; 800]; loop { - let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer); + let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer).unwrap(); if finished { assert!(bytes_read < 800); @@ -228,12 +265,11 @@ mod tests { } #[test] - #[should_panic(expected = "Error while reading input:")] fn test_fill_buffer_read_error() { - let mut input = ErrorInput; + let mut input = FailingReader { valid_read_count: 0 }; let mut buffer = vec![0; 800]; - fill_buffer(&mut input, &mut buffer); + assert!(fill_buffer(&mut input, &mut buffer).is_err()); } #[test] @@ -285,6 +321,46 @@ mod tests { ); } + #[test] + #[should_panic(expected = "Could not write the required bits to the writer")] + fn test_dump_suffix_array_fail_required_bits() { + let mut writer = FailingWriter { + valid_write_count: 0 + }; + + dump_suffix_array(&vec![], 1, &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the sparseness factor to the writer")] + fn test_dump_suffix_array_fail_sparseness_factor() { + let mut writer = FailingWriter { + valid_write_count: 1 + }; + + dump_suffix_array(&vec![], 1, &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the size of the suffix array to the writer")] + fn test_dump_suffix_array_fail_size() { + let mut writer = FailingWriter { + valid_write_count: 2 + }; + + dump_suffix_array(&vec![], 1, &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the suffix array to the writer")] + fn test_dump_suffix_array_fail_suffix_array() { + let mut writer = FailingWriter { + valid_write_count: 3 + }; + + dump_suffix_array(&vec![ 1 ], 1, &mut writer).unwrap(); + } + #[test] fn test_load_suffix_array() { let buffer = vec![ @@ -301,4 +377,34 @@ mod tests { assert_eq!(sample_rate, 1); assert_eq!(sa, vec![1, 2, 3, 4, 5]); } + + #[test] + #[should_panic(expected = "Could not read the sample rate from the binary file")] + fn test_load_suffix_array_fail_sample_rate() { + let mut reader = FailingReader { + valid_read_count: 0 + }; + + load_suffix_array(&mut reader).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not read the size of the suffix array from the binary file")] + fn test_load_suffix_array_fail_size() { + let mut reader = FailingReader { + valid_read_count: 1 + }; + + load_suffix_array(&mut reader).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not read the suffix array from the binary file")] + fn test_load_suffix_array_fail_suffix_array() { + let mut reader = FailingReader { + valid_read_count: 2 + }; + + load_suffix_array(&mut reader).unwrap(); + } } diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs index 9e2f769..ca13a82 100644 --- a/sa-index/src/lib.rs +++ b/sa-index/src/lib.rs @@ -106,6 +106,16 @@ mod tests { assert_eq!(sa.get(4), 5); } + #[test] + fn test_suffix_array_is_empty() { + let sa = 
SuffixArray::Original(vec![]); + assert_eq!(sa.is_empty(), true); + + let bitarray = BitArray::with_capacity(0, 0); + let sa = SuffixArray::Compressed(bitarray); + assert_eq!(sa.is_empty(), true); + } + #[test] fn test_nullable_is_null() { assert_eq!(u32::NULL.is_null(), true); diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs index 4144f38..85a245b 100644 --- a/sa-index/src/suffix_to_protein_index.rs +++ b/sa-index/src/suffix_to_protein_index.rs @@ -116,6 +116,7 @@ impl SparseSuffixToProtein { #[cfg(test)] mod tests { + use clap::ValueEnum; use sa_mappings::proteins::{ SEPARATION_CHARACTER, TERMINATION_CHARACTER @@ -123,6 +124,7 @@ mod tests { use crate::{ suffix_to_protein_index::{ + SuffixToProteinMappingStyle, DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex @@ -136,6 +138,12 @@ mod tests { text.into_bytes() } + #[test] + fn test_suffix_to_protein_mapping_style() { + assert_eq!(SuffixToProteinMappingStyle::Dense, SuffixToProteinMappingStyle::from_str("dense", false).unwrap()); + assert_eq!(SuffixToProteinMappingStyle::Sparse, SuffixToProteinMappingStyle::from_str("sparse", false).unwrap()); + } + #[test] fn test_dense_build() { let u8_text = &build_text(); diff --git a/sa-mappings/src/functionality.rs b/sa-mappings/src/functionality.rs index 8b60eb6..ec91eff 100644 --- a/sa-mappings/src/functionality.rs +++ b/sa-mappings/src/functionality.rs @@ -32,6 +32,9 @@ impl FunctionAggregator { /// /// Returns a JSON string containing the aggregated functional annotations pub fn aggregate(&self, proteins: Vec<&Protein>) -> FunctionalAggregation { + // Keep track of the proteins that have any annotation + let mut proteins_with_annotations: HashSet = HashSet::new(); + // Keep track of the proteins that have a certain annotation let mut proteins_with_ec: HashSet = HashSet::new(); let mut proteins_with_go: HashSet = HashSet::new(); @@ -43,10 +46,19 @@ impl FunctionAggregator { for protein in proteins.iter() { for annotation in protein.get_functional_annotations().split(';') { match annotation.chars().next() { - Some('E') => proteins_with_ec.insert(protein.uniprot_id.clone()), - Some('G') => proteins_with_go.insert(protein.uniprot_id.clone()), - Some('I') => proteins_with_ipr.insert(protein.uniprot_id.clone()), - _ => false + Some('E') => { + proteins_with_ec.insert(protein.uniprot_id.clone()); + proteins_with_annotations.insert(protein.uniprot_id.clone()); + }, + Some('G') => { + proteins_with_go.insert(protein.uniprot_id.clone()); + proteins_with_annotations.insert(protein.uniprot_id.clone()); + }, + Some('I') => { + proteins_with_ipr.insert(protein.uniprot_id.clone()); + proteins_with_annotations.insert(protein.uniprot_id.clone()); + }, + _ => {} }; data.entry(annotation.to_string()) @@ -56,7 +68,7 @@ impl FunctionAggregator { } let mut counts: HashMap = HashMap::new(); - counts.insert("all".to_string(), proteins.len()); + counts.insert("all".to_string(), proteins_with_annotations.len()); counts.insert("EC".to_string(), proteins_with_ec.len()); counts.insert("GO".to_string(), proteins_with_go.len()); counts.insert("IPR".to_string(), proteins_with_ipr.len()); @@ -100,30 +112,51 @@ mod tests { #[test] fn test_aggregate() { let mut proteins: Vec = Vec::new(); - proteins.push(Protein { + + let protein1 = Protein { uniprot_id: "P12345".to_string(), taxon_id: 9606, functional_annotations: encode("GO:0001234;GO:0005678") - }); - proteins.push(Protein { + }; + let protein2 = Protein { uniprot_id: "P23456".to_string(), taxon_id: 9606, 
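
// Editor's note (worked example, not part of this patch): with the change
// above, counts["all"] counts proteins that carry at least one recognized
// annotation instead of all matched proteins. For the four proteins in this
// test (GO-only, EC-only, IPR+EC, and one whose only annotation "2345" is
// not recognized), the expected counts are all = 3, EC = 2, GO = 1, IPR = 1.
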
functional_annotations: encode("EC:1.1.1.-") - }); + }; + let protein3 = Protein { + uniprot_id: "P23876".to_string(), + taxon_id: 9606, + functional_annotations: encode("IPR:IPR123456;EC:1.1.1.-") + }; + let protein4 = Protein { + uniprot_id: "P23877".to_string(), + taxon_id: 9606, + functional_annotations: encode("2345") + }; + + proteins.push(protein1); + proteins.push(protein2); + proteins.push(protein3); + proteins.push(protein4); let function_aggregator = FunctionAggregator {}; let result = function_aggregator.aggregate(proteins.iter().collect()); - assert_eq!(result.counts.get("all"), Some(&2)); - assert_eq!(result.counts.get("EC"), Some(&1)); + assert_eq!(result.counts.get("all"), Some(&3)); + assert_eq!(result.counts.get("EC"), Some(&2)); assert_eq!(result.counts.get("GO"), Some(&1)); - assert_eq!(result.counts.get("IPR"), Some(&0)); + assert_eq!(result.counts.get("IPR"), Some(&1)); assert_eq!(result.counts.get("NOTHING"), None); - assert_eq!(result.data.get("GO:0001234"), Some(&1)); - assert_eq!(result.data.get("GO:0005678"), Some(&1)); - assert_eq!(result.data.get("EC:1.1.1.-"), Some(&1)); + assert_eq!(result.data, { + let mut map = HashMap::new(); + map.insert("GO:0001234".to_string(), 1); + map.insert("GO:0005678".to_string(), 1); + map.insert("EC:1.1.1.-".to_string(), 2); + map.insert("IPR:IPR123456".to_string(), 1); + map + }); assert_eq!(result.data.get("EC:1.1.2.-"), None); } @@ -141,17 +174,24 @@ mod tests { taxon_id: 9606, functional_annotations: encode("EC:1.1.1.-") }; + let protein3 = Protein { + uniprot_id: "P23876".to_string(), + taxon_id: 9606, + functional_annotations: encode("IPR:IPR123456;EC:1.1.1.-") + }; proteins.push(&protein1); proteins.push(&protein2); + proteins.push(&protein3); let function_aggregator = FunctionAggregator {}; let result = function_aggregator.get_all_functional_annotations(proteins.as_slice()); - assert_eq!(result.len(), 2); + assert_eq!(result.len(), 3); assert_eq!(result[0].len(), 2); assert_eq!(result[1].len(), 1); + assert_eq!(result[2].len(), 2); } #[test] From 679043f52580714d34a70e37aa4c4776c9f62aff Mon Sep 17 00:00:00 2001 From: tibvdm Date: Sat, 18 May 2024 03:28:35 +0200 Subject: [PATCH 15/26] formatting --- bitarray/src/lib.rs | 56 ++++++++++++++----------- sa-builder/src/lib.rs | 27 ++++++++---- sa-compression/src/lib.rs | 2 +- sa-index/src/binary.rs | 6 ++- sa-index/src/suffix_to_protein_index.rs | 14 +++++-- sa-mappings/src/functionality.rs | 6 +-- 6 files changed, 69 insertions(+), 42 deletions(-) diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index d2e000c..695aa67 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -2,10 +2,13 @@ mod binary; -use std::{cmp::max, io::{ - Result, - Write -}}; +use std::{ + cmp::max, + io::{ + Result, + Write + } +}; /// Re-export the `Binary` trait. pub use binary::Binary; @@ -34,7 +37,11 @@ impl BitArray { /// /// A new `BitArray` with the specified capacity. pub fn with_capacity(capacity: usize, bits_per_value: usize) -> Self { - let extra = if capacity * bits_per_value % 64 == 0 { 0 } else { 1 }; + let extra = if capacity * bits_per_value % 64 == 0 { + 0 + } else { + 1 + }; Self { data: vec![0; capacity * bits_per_value / 64 + extra], mask: (1 << bits_per_value) - 1, @@ -160,9 +167,11 @@ pub fn data_to_writer( // Update the max capacity to be a multiple of the greatest common divisor of the bits per value // and 64. 
This is done to ensure that the bit array can store the data entirely
     let greatest_common_divisor = gcd(bits_per_value, 64);
-    let capacity = max(greatest_common_divisor, max_capacity / greatest_common_divisor * greatest_common_divisor);
+    let capacity =
+        max(greatest_common_divisor, max_capacity / greatest_common_divisor * greatest_common_divisor);
 
-    // If the amount of data is less than the max capacity, write the data to the writer in a single chunk
+    // If the amount of data is less than the max capacity, write the data to the writer in a single
+    // chunk
     if data.len() <= capacity {
         let mut bitarray = BitArray::with_capacity(data.len(), bits_per_value);
 
@@ -203,21 +212,21 @@ pub fn data_to_writer(
 }
 
 /// Calculates the greatest common divisor of two numbers.
-///
+///
 /// # Arguments
-///
+///
 /// * `a` - The first number.
 /// * `b` - The second number.
-///
+///
 /// # Returns
-///
+///
 /// The greatest common divisor of the two numbers.
 fn gcd(mut a: usize, mut b: usize) -> usize {
     while b != 0 {
-    if b < a {
-        std::mem::swap(&mut b, &mut a);
-    }
-    b %= a;
+        if b < a {
+            std::mem::swap(&mut b, &mut a);
+        }
+        b %= a;
     }
     a
 }
@@ -305,7 +314,7 @@ mod tests {
     fn test_data_to_writer_chunks_needed_no_remainder() {
         let data = vec![
             0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777,
-            0x88888888
+            0x88888888,
         ];
         let mut writer = Vec::new();
 
         data_to_writer(data, 32, 8, &mut writer).unwrap();
 
         assert_eq!(
             writer,
             vec![
-                0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33,
-                0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88,
-                0x88, 0x88, 0x77, 0x77, 0x77, 0x77
+                0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33,
+                0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88,
+                0x77, 0x77, 0x77, 0x77
             ]
         );
     }
 
     #[test]
     fn test_data_to_writer_chunks_needed_plus_remainder() {
         let data = vec![
             0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777,
-            0x88888888, 0x99999999
+            0x88888888, 0x99999999,
         ];
         let mut writer = Vec::new();
 
         data_to_writer(data, 32, 8, &mut writer).unwrap();
 
         assert_eq!(
             writer,
             vec![
-                0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33,
-                0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88,
-                0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0x00, 0x00, 0x00, 0x00, 0x99, 0x99, 0x99,
-                0x99
+                0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33,
+                0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88,
+                0x77, 0x77, 0x77, 0x77, 0x00, 0x00, 0x00, 0x00, 0x99, 0x99, 0x99, 0x99
             ]
         );
     }
diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs
index 1e6d834..57a4cfa 100644
--- a/sa-builder/src/lib.rs
+++ b/sa-builder/src/lib.rs
@@ -122,12 +122,17 @@ mod tests {
     #[test]
     fn test_arguments() {
         let args = Arguments::parse_from(&[
-            "sa-builder",
-            "--database-file", "database.fa",
-            "--taxonomy", "taxonomy.tsv",
-            "--output", "output.fa",
-            "--sparseness-factor", "2",
-            "--construction-algorithm", "lib-div-suf-sort",
+            "sa-builder",
+            "--database-file",
+            "database.fa",
+            "--taxonomy",
+            "taxonomy.tsv",
+            "--output",
+            "output.fa",
+            "--sparseness-factor",
+            "2",
+            "--construction-algorithm",
+            "lib-div-suf-sort",
             "--compress-sa"
         ]);
@@ -141,8 +146,14 @@ mod tests {
 
     #[test]
     fn test_sa_construction_algorithm() {
-        assert_eq!(SAConstructionAlgorithm::from_str("lib-div-suf-sort", false), Ok(SAConstructionAlgorithm::LibDivSufSort));
-        
assert_eq!(SAConstructionAlgorithm::from_str("lib-sais", false), Ok(SAConstructionAlgorithm::LibSais)); + assert_eq!( + SAConstructionAlgorithm::from_str("lib-div-suf-sort", false), + Ok(SAConstructionAlgorithm::LibDivSufSort) + ); + assert_eq!( + SAConstructionAlgorithm::from_str("lib-sais", false), + Ok(SAConstructionAlgorithm::LibSais) + ); } #[test] diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs index 1148aca..d4da1eb 100644 --- a/sa-compression/src/lib.rs +++ b/sa-compression/src/lib.rs @@ -196,7 +196,7 @@ mod tests { valid_write_count: 3 }; - dump_compressed_suffix_array(vec![ 1 ], 1, 8, &mut writer).unwrap(); + dump_compressed_suffix_array(vec![1], 1, 8, &mut writer).unwrap(); } #[test] diff --git a/sa-index/src/binary.rs b/sa-index/src/binary.rs index 87e0d0d..ae16a8f 100644 --- a/sa-index/src/binary.rs +++ b/sa-index/src/binary.rs @@ -266,7 +266,9 @@ mod tests { #[test] fn test_fill_buffer_read_error() { - let mut input = FailingReader { valid_read_count: 0 }; + let mut input = FailingReader { + valid_read_count: 0 + }; let mut buffer = vec![0; 800]; assert!(fill_buffer(&mut input, &mut buffer).is_err()); @@ -358,7 +360,7 @@ mod tests { valid_write_count: 3 }; - dump_suffix_array(&vec![ 1 ], 1, &mut writer).unwrap(); + dump_suffix_array(&vec![1], 1, &mut writer).unwrap(); } #[test] diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs index 85a245b..0091fed 100644 --- a/sa-index/src/suffix_to_protein_index.rs +++ b/sa-index/src/suffix_to_protein_index.rs @@ -124,10 +124,10 @@ mod tests { use crate::{ suffix_to_protein_index::{ - SuffixToProteinMappingStyle, DenseSuffixToProtein, SparseSuffixToProtein, - SuffixToProteinIndex + SuffixToProteinIndex, + SuffixToProteinMappingStyle }, Nullable }; @@ -140,8 +140,14 @@ mod tests { #[test] fn test_suffix_to_protein_mapping_style() { - assert_eq!(SuffixToProteinMappingStyle::Dense, SuffixToProteinMappingStyle::from_str("dense", false).unwrap()); - assert_eq!(SuffixToProteinMappingStyle::Sparse, SuffixToProteinMappingStyle::from_str("sparse", false).unwrap()); + assert_eq!( + SuffixToProteinMappingStyle::Dense, + SuffixToProteinMappingStyle::from_str("dense", false).unwrap() + ); + assert_eq!( + SuffixToProteinMappingStyle::Sparse, + SuffixToProteinMappingStyle::from_str("sparse", false).unwrap() + ); } #[test] diff --git a/sa-mappings/src/functionality.rs b/sa-mappings/src/functionality.rs index ec91eff..7a64150 100644 --- a/sa-mappings/src/functionality.rs +++ b/sa-mappings/src/functionality.rs @@ -49,15 +49,15 @@ impl FunctionAggregator { Some('E') => { proteins_with_ec.insert(protein.uniprot_id.clone()); proteins_with_annotations.insert(protein.uniprot_id.clone()); - }, + } Some('G') => { proteins_with_go.insert(protein.uniprot_id.clone()); proteins_with_annotations.insert(protein.uniprot_id.clone()); - }, + } Some('I') => { proteins_with_ipr.insert(protein.uniprot_id.clone()); proteins_with_annotations.insert(protein.uniprot_id.clone()); - }, + } _ => {} }; From a5638bb05fc37e7c23aa0b1aa562c162e93729d5 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Sat, 18 May 2024 04:10:39 +0200 Subject: [PATCH 16/26] fix bitarray tests --- bitarray/src/lib.rs | 43 +++++++++++++++++++++++++++++--- sa-mappings/src/functionality.rs | 4 +-- 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 695aa67..ce2d9ef 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -314,7 +314,15 @@ mod tests { fn 
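
// Editor's note (worked example, not part of this patch): the capacity
// rounding in data_to_writer guarantees that every chunk ends on a u64 word
// boundary. For instance, with bits_per_value = 40, gcd(40, 64) = 8, so
// chunk capacities are multiples of 8 values; 8 values occupy 8 * 40 = 320
// bits = exactly 5 words, so consecutive chunks can be written back to back
// without any bit-level stitching. The test that follows exercises this.
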
test_data_to_writer_chunks_needed_no_remainder() { let data = vec![ 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777, - 0x88888888, + 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, + 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, + 0x66666666, 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, + 0xdddddddd, 0xeeeeeeee, 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, + 0x44444444, 0x55555555, 0x66666666, 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, + 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, 0xffffffff, 0x00000000, 0x11111111, + 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777, 0x88888888, + 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, 0xffffffff, + 0x00000000, ]; let mut writer = Vec::new(); @@ -325,7 +333,23 @@ mod tests { vec![ 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, - 0x77, 0x77, 0x77, 0x77 + 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, + 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, + 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, + 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, + 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, + 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, + 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, + 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, + 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, + 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, + 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, + 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, + 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, + 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, + 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff ] ); } @@ -334,7 +358,10 @@ mod tests { fn test_data_to_writer_chunks_needed_plus_remainder() { let data = vec![ 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777, - 0x88888888, 0x99999999, + 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, + 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, + 0x66666666, 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, + 0xdddddddd, 0xeeeeeeee, 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, ]; let mut writer = Vec::new(); @@ -345,7 +372,15 @@ mod tests { vec![ 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, - 0x77, 0x77, 0x77, 0x77, 0x00, 0x00, 0x00, 0x00, 0x99, 0x99, 0x99, 0x99 + 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, + 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 
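
// Editor's note (worked example, not part of this patch): this test now
// needs 64 values because data_to_writer(data, 32, 8, ...) rounds the chunk
// capacity to max(gcd(32, 64), 8 / 32 * 32) = max(32, 0) = 32 values, so at
// least 33 values are required to reach the chunked path at all; 64 values
// give exactly two full chunks with no remainder, and the 35-value variant
// below leaves a 3-value remainder.
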
0xee, 0xdd, 0xdd, 0xdd, 0xdd, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, + 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, + 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, + 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, + 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, + 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x00, + 0x33, 0x33, 0x33, 0x33 ] ); } diff --git a/sa-mappings/src/functionality.rs b/sa-mappings/src/functionality.rs index 7a64150..45c446e 100644 --- a/sa-mappings/src/functionality.rs +++ b/sa-mappings/src/functionality.rs @@ -14,9 +14,9 @@ use crate::proteins::Protein; #[derive(Debug, Serialize)] pub struct FunctionalAggregation { /// A HashMap representing how many GO, EC and IPR terms were found - pub counts: HashMap, + counts: HashMap, /// A HashMap representing how often a certain functional annotation was found - pub data: HashMap + data: HashMap } /// A struct that represents a function aggregator From 41d6c45f5b07429d5839307ee239243efe533e5a Mon Sep 17 00:00:00 2001 From: tibvdm Date: Wed, 22 May 2024 12:04:51 +0200 Subject: [PATCH 17/26] some additional smaller tests --- Cargo.lock | 2 + sa-index/Cargo.toml | 4 + sa-index/out.txt | 1515 ++++++++++++++++++++++++++++++++ sa-index/src/peptide_search.rs | 70 ++ sa-index/src/sa_searcher.rs | 40 +- sa-mappings/src/taxonomy.rs | 74 +- 6 files changed, 1676 insertions(+), 29 deletions(-) create mode 100644 sa-index/out.txt diff --git a/Cargo.lock b/Cargo.lock index 3f82911..8b2e119 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1430,10 +1430,12 @@ version = "0.1.0" dependencies = [ "bitarray", "clap 4.5.4", + "fa-compression", "rayon", "sa-mappings", "serde", "serde_json", + "tempdir", "umgap", ] diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml index 986dc3c..557549c 100644 --- a/sa-index/Cargo.toml +++ b/sa-index/Cargo.toml @@ -5,6 +5,10 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[dev-dependencies] +tempdir = "0.3.7" +fa-compression = { path = "../fa-compression" } + [dependencies] clap = { version = "4.4.8", features = ["derive"] } umgap = "1.1.0" diff --git a/sa-index/out.txt b/sa-index/out.txt new file mode 100644 index 0000000..0bd0be1 --- /dev/null +++ b/sa-index/out.txt @@ -0,0 +1,1515 @@ +#![feature(prelude_import)] +#[prelude_import] +use std::prelude::rust_2021::*; +#[macro_use] +extern crate std; +use bitarray::BitArray; +pub mod binary { + use std::{error::Error, io::{BufRead, Read, Write}}; + /// The `Binary` trait provides methods for reading and writing a struct as binary. + pub trait Binary { + /// Writes the struct as binary to the given writer. + /// + /// # Arguments + /// + /// * `writer` - The writer to write the binary data to. + /// + /// # Returns + /// + /// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs. + fn write_binary(&self, writer: &mut W) -> std::io::Result<()>; + /// Reads binary data into a struct from the given reader. + /// + /// # Arguments + /// + /// * `reader` - The reader to read the binary data from. + /// + /// # Returns + /// + /// Returns `Ok(())` if the read operation is successful, or an `Err` if an error occurs. + fn read_binary(&mut self, reader: R) -> std::io::Result<()>; + } + /// Implements the `Binary` trait for `Vec`. 
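
// Editor's note: the sa-index/out.txt file added above is macro-expanded
// source (note the #![feature(prelude_import)] header and the generated
// #[automatically_derived] and _serde impls), i.e. the kind of output
// `cargo expand` produces rather than hand-written code.
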
+ impl Binary for Vec { + /// Writes the elements of the vector to a binary file. + /// + /// # Arguments + /// + /// * `writer` - The writer to which the binary data will be written. + /// + /// # Returns + /// + /// Returns `Ok(())` if the write operation is successful, or an `std::io::Error` otherwise. + fn write_binary(&self, writer: &mut W) -> std::io::Result<()> { + for value in self { + writer.write_all(&value.to_le_bytes())?; + } + Ok(()) + } + /// Reads binary data from a reader and populates the vector with the read values. + /// + /// # Arguments + /// + /// * `reader` - The reader from which the binary data will be read. + /// + /// # Returns + /// + /// Returns `Ok(())` if the read operation is successful, or an `std::io::Error` otherwise. + fn read_binary(&mut self, mut reader: R) -> std::io::Result<()> { + self.clear(); + let mut buffer = ::alloc::vec::from_elem(0, 8 * 1024); + loop { + let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer)?; + for buffer_slice in buffer[..bytes_read].chunks_exact(8) { + self.push(i64::from_le_bytes(buffer_slice.try_into().unwrap())); + } + if finished { + break; + } + } + Ok(()) + } + } + /// Writes the suffix array to a binary file. + /// + /// # Arguments + /// + /// * `sa` - The suffix array to dump. + /// * `sparseness_factor` - The sparseness factor to write to the file. + /// * `writer` - The writer to write the binary data to. + /// + /// # Returns + /// + /// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs. + pub fn dump_suffix_array( + sa: &Vec, + sparseness_factor: u8, + writer: &mut impl Write, + ) -> Result<(), Box> { + writer + .write(&[64_u8]) + .map_err(|_| "Could not write the required bits to the writer")?; + writer + .write(&[sparseness_factor]) + .map_err(|_| "Could not write the sparseness factor to the writer")?; + let sa_len = sa.len(); + writer + .write(&(sa_len).to_le_bytes()) + .map_err(|_| "Could not write the size of the suffix array to the writer")?; + sa.write_binary(writer) + .map_err(|_| "Could not write the suffix array to the writer")?; + Ok(()) + } + /// Loads the suffix array from the file with the given `filename` + /// + /// # Arguments + /// * `filename` - The filename of the file where the suffix array is stored + /// + /// # Returns + /// + /// Returns the sample rate of the suffix array, together with the suffix array + /// + /// # Errors + /// + /// Returns any error from opening the file or reading the file + pub fn load_suffix_array( + reader: &mut impl BufRead, + ) -> Result<(u8, Vec), Box> { + let mut sample_rate_buffer = [0_u8; 1]; + reader + .read_exact(&mut sample_rate_buffer) + .map_err(|_| "Could not read the sample rate from the binary file")?; + let sample_rate = sample_rate_buffer[0]; + let mut size_buffer = [0_u8; 8]; + reader + .read_exact(&mut size_buffer) + .map_err(|_| { + "Could not read the size of the suffix array from the binary file" + })?; + let size = u64::from_le_bytes(size_buffer) as usize; + let mut sa = Vec::with_capacity(size); + sa.read_binary(reader) + .map_err(|_| "Could not read the suffix array from the binary file")?; + Ok((sample_rate, sa)) + } + /// Fills the buffer with data read from the input. + /// + /// # Arguments + /// + /// * `input` - The input source to read data from. + /// * `buffer` - The buffer to fill with data. 
+ /// + /// # Returns + /// + /// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input + /// is reached, and `bytes_read` is the number of bytes read into the buffer. + fn fill_buffer( + input: &mut T, + buffer: &mut Vec, + ) -> std::io::Result<(bool, usize)> { + let buffer_size = buffer.len(); + let mut writable_buffer_space = buffer.as_mut(); + loop { + match input.read(writable_buffer_space) { + Ok(0) => { + return Ok(( + !writable_buffer_space.is_empty(), + buffer_size - writable_buffer_space.len(), + )); + } + Ok(bytes_read) => { + writable_buffer_space = writable_buffer_space[bytes_read..].as_mut(); + } + Err(e) => { + return Err(e); + } + } + } + } +} +pub mod peptide_search { + use rayon::prelude::*; + use sa_mappings::{functionality::FunctionalAggregation, proteins::Protein}; + use serde::Serialize; + use crate::sa_searcher::{SearchAllSuffixesResult, Searcher}; + /// Struct representing a collection of `SearchResultWithAnalysis` or `SearchOnlyResult` results + pub struct OutputData { + result: Vec, + } + #[automatically_derived] + impl ::core::fmt::Debug for OutputData { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_struct_field1_finish( + f, + "OutputData", + "result", + &&self.result, + ) + } + } + #[doc(hidden)] + #[allow(non_upper_case_globals, unused_attributes, unused_qualifications)] + const _: () = { + #[allow(unused_extern_crates, clippy::useless_attribute)] + extern crate serde as _serde; + #[automatically_derived] + impl _serde::Serialize for OutputData + where + T: _serde::Serialize, + { + fn serialize<__S>( + &self, + __serializer: __S, + ) -> _serde::__private::Result<__S::Ok, __S::Error> + where + __S: _serde::Serializer, + { + let mut __serde_state = _serde::Serializer::serialize_struct( + __serializer, + "OutputData", + false as usize + 1, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "result", + &self.result, + )?; + _serde::ser::SerializeStruct::end(__serde_state) + } + } + }; + /// Struct representing the search result of the `sequence` in the index, including the analyses + pub struct SearchResultWithAnalysis { + sequence: String, + lca: Option, + taxa: Vec, + uniprot_accession_numbers: Vec, + fa: Option, + cutoff_used: bool, + } + #[automatically_derived] + impl ::core::fmt::Debug for SearchResultWithAnalysis { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + let names: &'static _ = &[ + "sequence", + "lca", + "taxa", + "uniprot_accession_numbers", + "fa", + "cutoff_used", + ]; + let values: &[&dyn ::core::fmt::Debug] = &[ + &self.sequence, + &self.lca, + &self.taxa, + &self.uniprot_accession_numbers, + &self.fa, + &&self.cutoff_used, + ]; + ::core::fmt::Formatter::debug_struct_fields_finish( + f, + "SearchResultWithAnalysis", + names, + values, + ) + } + } + #[doc(hidden)] + #[allow(non_upper_case_globals, unused_attributes, unused_qualifications)] + const _: () = { + #[allow(unused_extern_crates, clippy::useless_attribute)] + extern crate serde as _serde; + #[automatically_derived] + impl _serde::Serialize for SearchResultWithAnalysis { + fn serialize<__S>( + &self, + __serializer: __S, + ) -> _serde::__private::Result<__S::Ok, __S::Error> + where + __S: _serde::Serializer, + { + let mut __serde_state = _serde::Serializer::serialize_struct( + __serializer, + "SearchResultWithAnalysis", + false as usize + 1 + 1 + 1 + 1 + 1 + 1, + )?; + 
_serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "sequence", + &self.sequence, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "lca", + &self.lca, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "taxa", + &self.taxa, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "uniprot_accession_numbers", + &self.uniprot_accession_numbers, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "fa", + &self.fa, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "cutoff_used", + &self.cutoff_used, + )?; + _serde::ser::SerializeStruct::end(__serde_state) + } + } + }; + /// Struct representing the search result of the `sequence` in the index (without the analyses) + pub struct SearchOnlyResult { + sequence: String, + proteins: Vec, + cutoff_used: bool, + } + #[automatically_derived] + impl ::core::fmt::Debug for SearchOnlyResult { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_struct_field3_finish( + f, + "SearchOnlyResult", + "sequence", + &self.sequence, + "proteins", + &self.proteins, + "cutoff_used", + &&self.cutoff_used, + ) + } + } + #[doc(hidden)] + #[allow(non_upper_case_globals, unused_attributes, unused_qualifications)] + const _: () = { + #[allow(unused_extern_crates, clippy::useless_attribute)] + extern crate serde as _serde; + #[automatically_derived] + impl _serde::Serialize for SearchOnlyResult { + fn serialize<__S>( + &self, + __serializer: __S, + ) -> _serde::__private::Result<__S::Ok, __S::Error> + where + __S: _serde::Serializer, + { + let mut __serde_state = _serde::Serializer::serialize_struct( + __serializer, + "SearchOnlyResult", + false as usize + 1 + 1 + 1, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "sequence", + &self.sequence, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "proteins", + &self.proteins, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "cutoff_used", + &self.cutoff_used, + )?; + _serde::ser::SerializeStruct::end(__serde_state) + } + } + }; + /// Struct that represents all information known about a certain protein in our database + pub struct ProteinInfo { + taxon: usize, + uniprot_accession: String, + functional_annotations: Vec, + } + #[automatically_derived] + impl ::core::fmt::Debug for ProteinInfo { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_struct_field3_finish( + f, + "ProteinInfo", + "taxon", + &self.taxon, + "uniprot_accession", + &self.uniprot_accession, + "functional_annotations", + &&self.functional_annotations, + ) + } + } + #[doc(hidden)] + #[allow(non_upper_case_globals, unused_attributes, unused_qualifications)] + const _: () = { + #[allow(unused_extern_crates, clippy::useless_attribute)] + extern crate serde as _serde; + #[automatically_derived] + impl _serde::Serialize for ProteinInfo { + fn serialize<__S>( + &self, + __serializer: __S, + ) -> _serde::__private::Result<__S::Ok, __S::Error> + where + __S: _serde::Serializer, + { + let mut __serde_state = _serde::Serializer::serialize_struct( + __serializer, + "ProteinInfo", + false as usize + 1 + 1 + 1, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "taxon", + &self.taxon, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + 
"uniprot_accession", + &self.uniprot_accession, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "functional_annotations", + &self.functional_annotations, + )?; + _serde::ser::SerializeStruct::end(__serde_state) + } + } + }; + /// Searches the `peptide` in the index multithreaded and retrieves the matching proteins + /// + /// # Arguments + /// * `searcher` - The Searcher which contains the protein database + /// * `peptide` - The peptide that is being searched in the index + /// * `cutoff` - The maximum amount of matches we want to process from the index + /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search + /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the + /// taxonomy + /// + /// # Returns + /// + /// Returns Some if matches are found. + /// The first argument is true if the cutoff is used, otherwise false + /// The second argument is a list of all matching proteins for the peptide + /// Returns None if the peptides does not have any matches, or if the peptide is shorter than the + /// sparseness factor k used in the index + pub fn search_proteins_for_peptide<'a>( + searcher: &'a Searcher, + peptide: &str, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, + ) -> Option<(bool, Vec<&'a Protein>)> { + let peptide = peptide.strip_suffix('\n').unwrap_or(peptide).to_uppercase(); + if peptide.len() < searcher.sparseness_factor as usize { + return None; + } + let suffix_search = searcher + .search_matching_suffixes(peptide.as_bytes(), cutoff, equalize_i_and_l); + let mut cutoff_used = false; + let suffixes = match suffix_search { + SearchAllSuffixesResult::MaxMatches(matched_suffixes) => { + cutoff_used = true; + matched_suffixes + } + SearchAllSuffixesResult::SearchResult(matched_suffixes) => matched_suffixes, + SearchAllSuffixesResult::NoMatches => { + return None; + } + }; + let mut proteins = searcher.retrieve_proteins(&suffixes); + if clean_taxa { + proteins.retain(|protein| searcher.taxon_valid(protein)) + } + Some((cutoff_used, proteins)) + } + /// Searches the `peptide` in the index multithreaded and retrieves the protein information from the + /// database This does NOT perform any of the analyses, it only retrieves the functional and + /// taxonomic annotations + /// + /// # Arguments + /// * `searcher` - The Searcher which contains the protein database + /// * `peptide` - The peptide that is being searched in the index + /// * `cutoff` - The maximum amount of matches we want to process from the index + /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search + /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the + /// taxonomy + /// + /// # Returns + /// + /// Returns Some(SearchOnlyResult) if the peptide has matches + /// Returns None if the peptides does not have any matches, or if the peptide is shorter than the + /// sparseness factor k used in the index + pub fn search_peptide_retrieve_annotations( + searcher: &Searcher, + peptide: &str, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, + ) -> Option { + let (cutoff_used, proteins) = search_proteins_for_peptide( + searcher, + peptide, + cutoff, + equalize_i_and_l, + clean_taxa, + )?; + let annotations = searcher.get_all_functional_annotations(&proteins); + let mut protein_info: Vec = ::alloc::vec::Vec::new(); + for (&protein, annotations) in proteins.iter().zip(annotations) { + protein_info + .push(ProteinInfo { 
+                    taxon: protein.taxon_id,
+                    uniprot_accession: protein.uniprot_id.clone(),
+                    functional_annotations: annotations,
+                })
+        }
+        Some(SearchOnlyResult {
+            sequence: peptide.to_string(),
+            proteins: protein_info,
+            cutoff_used,
+        })
+    }
+    /// Searches the `peptide` in the index multithreaded and performs the taxonomic and functional
+    /// analyses
+    ///
+    /// # Arguments
+    /// * `searcher` - The Searcher which contains the protein database
+    /// * `peptide` - The peptide that is being searched in the index
+    /// * `cutoff` - The maximum amount of matches we want to process from the index
+    /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search
+    /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the
+    ///   taxonomy
+    ///
+    /// # Returns
+    ///
+    /// Returns Some(SearchResultWithAnalysis) if the peptide has matches
+    /// Returns None if the peptide does not have any matches, or if the peptide is shorter than the
+    /// sparseness factor k used in the index
+    pub fn analyse_peptide(
+        searcher: &Searcher,
+        peptide: &str,
+        cutoff: usize,
+        equalize_i_and_l: bool,
+        clean_taxa: bool,
+    ) -> Option<SearchResultWithAnalysis> {
+        let (cutoff_used, mut proteins) = search_proteins_for_peptide(
+            searcher,
+            peptide,
+            cutoff,
+            equalize_i_and_l,
+            clean_taxa,
+        )?;
+        if clean_taxa {
+            proteins.retain(|protein| searcher.taxon_valid(protein))
+        }
+        let lca = if cutoff_used { Some(1) } else { searcher.retrieve_lca(&proteins) };
+        lca?;
+        let mut uniprot_accession_numbers = ::alloc::vec::Vec::new();
+        let mut taxa = ::alloc::vec::Vec::new();
+        for protein in &proteins {
+            taxa.push(protein.taxon_id);
+            uniprot_accession_numbers.push(protein.uniprot_id.clone());
+        }
+        let fa = searcher.retrieve_function(&proteins);
+        Some(SearchResultWithAnalysis {
+            sequence: peptide.to_string(),
+            lca,
+            cutoff_used,
+            uniprot_accession_numbers,
+            taxa,
+            fa,
+        })
+    }
+    /// Searches the list of `peptides` in the index multithreaded and performs the functional and
+    /// taxonomic analyses
+    ///
+    /// # Arguments
+    /// * `searcher` - The Searcher which contains the protein database
+    /// * `peptides` - List of peptides we want to search in the index
+    /// * `cutoff` - The maximum amount of matches we want to process from the index
+    /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search
+    /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the
+    ///   taxonomy
+    ///
+    /// # Returns
+    ///
+    /// Returns an `OutputData` object with the search and analyses results
+    /// for the peptides
+    pub fn analyse_all_peptides(
+        searcher: &Searcher,
+        peptides: &Vec<String>,
+        cutoff: usize,
+        equalize_i_and_l: bool,
+        clean_taxa: bool,
+    ) -> OutputData<SearchResultWithAnalysis> {
+        let res: Vec<SearchResultWithAnalysis> = peptides
+            .par_iter()
+            .map(|peptide| analyse_peptide(
+                searcher,
+                peptide,
+                cutoff,
+                equalize_i_and_l,
+                clean_taxa,
+            ))
+            .filter_map(|search_result| search_result)
+            .collect();
+        OutputData { result: res }
+    }
+    /// Searches the list of `peptides` in the index and retrieves all related information about the
+    /// found proteins. This does NOT perform any of the analyses
+    ///
+    /// # Arguments
+    /// * `searcher` - The Searcher which contains the protein database
+    /// * `peptides` - List of peptides we want to search in the index
+    /// * `cutoff` - The maximum amount of matches we want to process from the index
+    /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search
+    /// * `clean_taxa` - Boolean indicating if
we want to filter out proteins that are invalid in the + /// taxonomy + /// + /// # Returns + /// + /// Returns an `OutputData` object with the search results for the peptides + pub fn search_all_peptides( + searcher: &Searcher, + peptides: &Vec, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, + ) -> OutputData { + let res: Vec = peptides + .par_iter() + .map(|peptide| { + search_peptide_retrieve_annotations( + searcher, + peptide, + cutoff, + equalize_i_and_l, + clean_taxa, + ) + }) + .filter_map(|search_result| search_result) + .collect(); + OutputData { result: res } + } +} +pub mod sa_searcher { + use std::cmp::min; + use sa_mappings::{ + functionality::{FunctionAggregator, FunctionalAggregation}, + proteins::{Protein, Proteins}, + taxonomy::TaxonAggregator, + }; + use umgap::taxon::TaxonId; + use crate::{ + define_struct, sa_searcher::BoundSearch::{Maximum, Minimum}, + suffix_to_protein_index::SuffixToProteinIndex, Nullable, SuffixArray, + }; + /// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array + enum BoundSearch { + Minimum, + Maximum, + } + #[automatically_derived] + impl ::core::clone::Clone for BoundSearch { + #[inline] + fn clone(&self) -> BoundSearch { + *self + } + } + #[automatically_derived] + impl ::core::marker::Copy for BoundSearch {} + #[automatically_derived] + impl ::core::marker::StructuralPartialEq for BoundSearch {} + #[automatically_derived] + impl ::core::cmp::PartialEq for BoundSearch { + #[inline] + fn eq(&self, other: &BoundSearch) -> bool { + let __self_tag = ::core::intrinsics::discriminant_value(self); + let __arg1_tag = ::core::intrinsics::discriminant_value(other); + __self_tag == __arg1_tag + } + } + /// Enum representing the minimum and maximum bound of the found matches in the suffix array + pub enum BoundSearchResult { + NoMatches, + SearchResult((usize, usize)), + } + #[automatically_derived] + impl ::core::marker::StructuralPartialEq for BoundSearchResult {} + #[automatically_derived] + impl ::core::cmp::PartialEq for BoundSearchResult { + #[inline] + fn eq(&self, other: &BoundSearchResult) -> bool { + let __self_tag = ::core::intrinsics::discriminant_value(self); + let __arg1_tag = ::core::intrinsics::discriminant_value(other); + __self_tag == __arg1_tag + && match (self, other) { + ( + BoundSearchResult::SearchResult(__self_0), + BoundSearchResult::SearchResult(__arg1_0), + ) => *__self_0 == *__arg1_0, + _ => true, + } + } + } + #[automatically_derived] + impl ::core::fmt::Debug for BoundSearchResult { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + match self { + BoundSearchResult::NoMatches => { + ::core::fmt::Formatter::write_str(f, "NoMatches") + } + BoundSearchResult::SearchResult(__self_0) => { + ::core::fmt::Formatter::debug_tuple_field1_finish( + f, + "SearchResult", + &__self_0, + ) + } + } + } + } + /// Enum representing the matching suffixes after searching a peptide in the suffix array + /// Both the MaxMatches and SearchResult indicate found suffixes, but MaxMatches is used when the + /// cutoff is reached. 
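As a quick illustration of the contract described above, a caller of `search_matching_suffixes` branches on the three variants of the enum defined just below. A minimal sketch, assuming an already constructed `Searcher` with the imports in scope:

```rust
// Sketch: both MaxMatches and SearchResult carry matching suffixes, but
// MaxMatches signals that the cutoff was hit and the result may be incomplete.
fn report_matches(searcher: &Searcher) {
    match searcher.search_matching_suffixes(b"MSKIAALLPSV", 10_000, true) {
        SearchAllSuffixesResult::SearchResult(suffixes) => {
            println!("{} matching suffixes (complete)", suffixes.len())
        }
        SearchAllSuffixesResult::MaxMatches(suffixes) => {
            println!("{} matching suffixes (cutoff reached)", suffixes.len())
        }
        SearchAllSuffixesResult::NoMatches => println!("no matches"),
    }
}
```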
+    pub enum SearchAllSuffixesResult {
+        NoMatches,
+        MaxMatches(Vec<i64>),
+        SearchResult(Vec<i64>),
+    }
+    #[automatically_derived]
+    impl ::core::fmt::Debug for SearchAllSuffixesResult {
+        #[inline]
+        fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
+            match self {
+                SearchAllSuffixesResult::NoMatches => {
+                    ::core::fmt::Formatter::write_str(f, "NoMatches")
+                }
+                SearchAllSuffixesResult::MaxMatches(__self_0) => {
+                    ::core::fmt::Formatter::debug_tuple_field1_finish(
+                        f,
+                        "MaxMatches",
+                        &__self_0,
+                    )
+                }
+                SearchAllSuffixesResult::SearchResult(__self_0) => {
+                    ::core::fmt::Formatter::debug_tuple_field1_finish(
+                        f,
+                        "SearchResult",
+                        &__self_0,
+                    )
+                }
+            }
+        }
+    }
+    /// Custom implementation of PartialEq for SearchAllSuffixesResult.
+    /// We consider 2 SearchAllSuffixesResult values equal if they consist of the same variant, and the Vec contains
+    /// the same values, but the order can be different
+    impl PartialEq for SearchAllSuffixesResult {
+        fn eq(&self, other: &Self) -> bool {
+            /// Returns true if `arr1` and `arr2` contain the same elements; the order of the elements
+            /// is ignored
+            ///
+            /// # Arguments
+            /// * `arr1` - The first array used in the comparison
+            /// * `arr2` - The second array used in the comparison
+            ///
+            /// # Returns
+            ///
+            /// Returns true if arr1 and arr2 contain the same elements; the order of the elements is
+            /// ignored
+            fn array_eq_unordered(arr1: &[i64], arr2: &[i64]) -> bool {
+                let mut arr1_copy = arr1.to_owned();
+                let mut arr2_copy = arr2.to_owned();
+                arr1_copy.sort();
+                arr2_copy.sort();
+                arr1_copy == arr2_copy
+            }
+            match (self, other) {
+                (
+                    SearchAllSuffixesResult::MaxMatches(arr1),
+                    SearchAllSuffixesResult::MaxMatches(arr2),
+                ) => array_eq_unordered(arr1, arr2),
+                (
+                    SearchAllSuffixesResult::SearchResult(arr1),
+                    SearchAllSuffixesResult::SearchResult(arr2),
+                ) => array_eq_unordered(arr1, arr2),
+                (
+                    SearchAllSuffixesResult::NoMatches,
+                    SearchAllSuffixesResult::NoMatches,
+                ) => true,
+                _ => false,
+            }
+        }
+    }
+    /// Struct that contains all the elements needed to search a peptide in the suffix array.
+    /// This struct also contains all the functions used for search
+    ///
+    /// # Arguments
+    /// * `sa` - The sparse suffix array representing the protein database
+    /// * `sparseness_factor` - The sparseness factor used by the suffix array
+    /// * `suffix_index_to_protein` - Mapping from a suffix to the proteins, to know which protein a suffix is
+    ///   part of
+    /// * `taxon_id_calculator` - Object representing the used taxonomy and that calculates the
+    ///   taxonomic analysis provided by Unipept
+    /// * `function_aggregator` - Object used to retrieve the functional annotations and to calculate
+    ///   the functional analysis provided by Unipept
+    pub struct Searcher {
+        sa: SuffixArray,
+        pub sparseness_factor: u8,
+        suffix_index_to_protein: Box<dyn SuffixToProteinIndex>,
+        proteins: Proteins,
+        taxon_id_calculator: TaxonAggregator,
+        function_aggregator: FunctionAggregator,
+    }
+    impl Searcher {
+        /// Creates a new Searcher object
+        ///
+        /// # Arguments
+        /// * `sa` - The sparse suffix array representing the protein database
+        /// * `sparseness_factor` - The sparseness factor used by the suffix array
+        /// * `suffix_index_to_protein` - Mapping from a suffix to the proteins, to know which protein a suffix
+        ///   is part of
+        /// * `proteins` - List of all the proteins on which the suffix array is built
+        /// * `taxon_id_calculator` - Object representing the used taxonomy and that calculates the
+        ///   taxonomic analysis provided by Unipept
+        /// * `function_aggregator` - Object used to retrieve the functional annotations and to
+        ///   calculate the functional analysis provided by Unipept
+        ///
+        /// # Returns
+        ///
+        /// Returns a new Searcher object
+        pub fn new(
+            sa: SuffixArray,
+            sparseness_factor: u8,
+            suffix_index_to_protein: Box<dyn SuffixToProteinIndex>,
+            proteins: Proteins,
+            taxon_id_calculator: TaxonAggregator,
+            function_aggregator: FunctionAggregator,
+        ) -> Self {
+            Self {
+                sa,
+                sparseness_factor,
+                suffix_index_to_protein,
+                proteins,
+                taxon_id_calculator,
+                function_aggregator,
+            }
+        }
+        /// Compares the `search_string` to the `suffix`.
+        /// During search this function performs extra logic since the suffix array is built with I ==
+        /// L, while `self.proteins.input_string` is the original text where I != L
+        ///
+        /// # Arguments
+        /// * `search_string` - The string/peptide being searched in the suffix array
+        /// * `suffix` - The current suffix from the suffix array we are comparing with in the binary
+        ///   search
+        /// * `skip` - How many characters we can skip in the comparison because we already know these
+        ///   match
+        /// * `bound` - Indicates if we are searching for the min or max bound
+        ///
+        /// # Returns
+        ///
+        /// The first argument is true if `bound` == `Minimum` and `search_string` <= `suffix` or if
+        /// `bound` == `Maximum` and `search_string` >= `suffix`. The second argument indicates how
+        /// far the `suffix` and `search_string` matched
+        fn compare(
+            &self,
+            search_string: &[u8],
+            suffix: i64,
+            skip: usize,
+            bound: BoundSearch,
+        ) -> (bool, usize) {
+            let mut index_in_suffix = (suffix as usize) + skip;
+            let mut index_in_search_string = skip;
+            let mut is_cond_or_equal = false;
+            let condition_check = match bound {
+                Minimum => |a: u8, b: u8| a < b,
+                Maximum => |a: u8, b: u8| a > b,
+            };
+            while index_in_search_string < search_string.len()
+                && index_in_suffix < self.proteins.input_string.len()
+                && (search_string[index_in_search_string]
+                    == self.proteins.input_string[index_in_suffix]
+                    || (search_string[index_in_search_string] == b'L'
+                        && self.proteins.input_string[index_in_suffix] == b'I')
+                    || (search_string[index_in_search_string] == b'I'
+                        && self.proteins.input_string[index_in_suffix] == b'L'))
+            {
+                index_in_suffix += 1;
+                index_in_search_string += 1;
+            }
+            if !search_string.is_empty() {
+                if index_in_search_string == search_string.len() {
+                    is_cond_or_equal = true
+                } else if index_in_suffix < self.proteins.input_string.len() {
+                    let peptide_char = if search_string[index_in_search_string] == b'L' {
+                        b'I'
+                    } else {
+                        search_string[index_in_search_string]
+                    };
+                    let protein_char = if self.proteins.input_string[index_in_suffix]
+                        == b'L'
+                    {
+                        b'I'
+                    } else {
+                        self.proteins.input_string[index_in_suffix]
+                    };
+                    is_cond_or_equal = condition_check(peptide_char, protein_char);
+                }
+            }
+            (is_cond_or_equal, index_in_search_string)
+        }
+        /// Searches for the minimum or maximum bound for a string in the suffix array
+        ///
+        /// # Arguments
+        /// * `bound` - Indicates if we are searching the minimum or maximum bound
+        /// * `search_string` - The string/peptide we are searching in the suffix array
+        ///
+        /// # Returns
+        ///
+        /// The first argument is true if a match was found
+        /// The second argument indicates the index of the minimum or maximum bound for the match
+        /// (depending on `bound`)
+        fn binary_search_bound(
+            &self,
+            bound: BoundSearch,
+            search_string: &[u8],
+        ) -> (bool, usize) {
+            let mut left: usize = 0;
+            let mut right: usize = self.sa.len();
+            let mut lcp_left: usize = 0;
+            let mut lcp_right: usize = 0;
+            let mut found = false;
+            while right - left > 1 {
+                let center = (left + right) / 2;
+                let skip = min(lcp_left, lcp_right);
+                let (retval, lcp_center) = self
+                    .compare(search_string,
self.sa.get(center), skip, bound); + found |= lcp_center == search_string.len(); + if retval && bound == Minimum || !retval && bound == Maximum { + right = center; + lcp_right = lcp_center; + } else { + left = center; + lcp_left = lcp_center; + } + } + if right == 1 && left == 0 { + let (retval, lcp_center) = self + .compare( + search_string, + self.sa.get(0), + min(lcp_left, lcp_right), + bound, + ); + found |= lcp_center == search_string.len(); + if bound == Minimum && retval { + right = 0; + } + } + match bound { + Minimum => (found, right), + Maximum => (found, left), + } + } + /// Searches for the minimum and maximum bound for a string in the suffix array + /// + /// # Arguments + /// * `search_string` - The string/peptide we are searching in the suffix array + /// + /// # Returns + /// + /// Returns the minimum and maximum bound of all matches in the suffix array, or `NoMatches` if + /// no matches were found + pub fn search_bounds(&self, search_string: &[u8]) -> BoundSearchResult { + let (found_min, min_bound) = self + .binary_search_bound(Minimum, search_string); + if !found_min { + return BoundSearchResult::NoMatches; + } + let (_, max_bound) = self.binary_search_bound(Maximum, search_string); + BoundSearchResult::SearchResult((min_bound, max_bound + 1)) + } + /// Searches for the suffixes matching a search string + /// During search I and L can be equated + /// + /// # Arguments + /// * `search_string` - The string/peptide we are searching in the suffix array + /// * `max_matches` - The maximum amount of matches processed, if more matches are found we + /// don't process them + /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false + /// + /// # Returns + /// + /// Returns all the matching suffixes + #[inline] + pub fn search_matching_suffixes( + &self, + search_string: &[u8], + max_matches: usize, + equalize_i_and_l: bool, + ) -> SearchAllSuffixesResult { + let mut matching_suffixes: Vec = ::alloc::vec::Vec::new(); + let mut il_locations = ::alloc::vec::Vec::new(); + for (i, &character) in search_string.iter().enumerate() { + if character == b'I' || character == b'L' { + il_locations.push(i); + } + } + let mut skip: usize = 0; + while skip < self.sparseness_factor as usize { + let mut il_locations_start = 0; + while il_locations_start < il_locations.len() + && il_locations[il_locations_start] < skip + { + il_locations_start += 1; + } + let il_locations_current_suffix = &il_locations[il_locations_start..]; + let current_search_string_prefix = &search_string[..skip]; + let current_search_string_suffix = &search_string[skip..]; + let search_bound_result = self.search_bounds(&search_string[skip..]); + if let BoundSearchResult::SearchResult((min_bound, max_bound)) = search_bound_result { + let mut sa_index = min_bound; + while sa_index < max_bound { + let suffix = self.sa.get(sa_index) as usize; + if suffix >= skip + && ((skip == 0 + || Self::check_prefix( + current_search_string_prefix, + &self.proteins.input_string[suffix - skip..suffix], + equalize_i_and_l, + )) + && Self::check_suffix( + skip, + il_locations_current_suffix, + current_search_string_suffix, + &self + .proteins + .input_string[suffix..suffix + search_string.len() - skip], + equalize_i_and_l, + )) + { + matching_suffixes.push((suffix - skip) as i64); + if matching_suffixes.len() >= max_matches { + return SearchAllSuffixesResult::MaxMatches( + matching_suffixes, + ); + } + } + sa_index += 1; + } + } + skip += 1; + } + if matching_suffixes.is_empty() { + 
+                SearchAllSuffixesResult::NoMatches
+            } else {
+                SearchAllSuffixesResult::SearchResult(matching_suffixes)
+            }
+        }
+        /// Returns true if the prefixes are the same;
+        /// if `equalize_i_and_l` is set to true, L and I are considered the same
+        ///
+        /// # Arguments
+        /// * `search_string_prefix` - The unchecked prefix of the string/peptide that is searched
+        /// * `index_prefix` - The unchecked prefix from the protein from the suffix array
+        /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false
+        ///
+        /// # Returns
+        ///
+        /// Returns true if `search_string_prefix` and `index_prefix` are considered the same, otherwise
+        /// false
+        #[inline]
+        fn check_prefix(
+            search_string_prefix: &[u8],
+            index_prefix: &[u8],
+            equalize_i_and_l: bool,
+        ) -> bool {
+            if equalize_i_and_l {
+                search_string_prefix
+                    .iter()
+                    .zip(index_prefix)
+                    .all(|(&search_character, &index_character)| {
+                        search_character == index_character
+                            || (search_character == b'I' && index_character == b'L')
+                            || (search_character == b'L' && index_character == b'I')
+                    })
+            } else {
+                search_string_prefix == index_prefix
+            }
+        }
+        /// Returns true if the search_string and index_string are equal.
+        /// This is automatically true if `equalize_i_and_l` is set to true, since they were matched during
+        /// search with I == L. If `equalize_i_and_l` is set to false, we need to check if the I and
+        /// L locations have the same character
+        ///
+        /// # Arguments
+        /// * `skip` - The used skip factor during the search iteration
+        /// * `il_locations` - The locations of the I's and L's in the **original** peptide
+        /// * `search_string` - The peptide that is being searched, but already with the skipped prefix
+        ///   removed from it
+        /// * `index_string` - The suffix that search_string matches with when I and L were equalized
+        ///   during search
+        /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false
+        ///
+        /// # Returns
+        ///
+        /// Returns true if `search_string` and `index_string` are considered the same, otherwise false
+        fn check_suffix(
+            skip: usize,
+            il_locations: &[usize],
+            search_string: &[u8],
+            index_string: &[u8],
+            equalize_i_and_l: bool,
+        ) -> bool {
+            if equalize_i_and_l {
+                true
+            } else {
+                for &il_location in il_locations {
+                    let index = il_location - skip;
+                    if search_string[index] != index_string[index] {
+                        return false;
+                    }
+                }
+                true
+            }
+        }
+        /// Returns all the proteins that correspond with the provided suffixes
+        ///
+        /// # Arguments
+        /// * `suffixes` - List of suffix indices
+        ///
+        /// # Returns
+        ///
+        /// Returns the proteins that every suffix is a part of
+        #[inline]
+        pub fn retrieve_proteins(&self, suffixes: &Vec<i64>) -> Vec<&Protein> {
+            let mut res = ::alloc::vec::Vec::new();
+            for &suffix in suffixes {
+                let protein_index = self
+                    .suffix_index_to_protein
+                    .suffix_to_protein(suffix);
+                if !protein_index.is_null() {
+                    res.push(&self.proteins[protein_index as usize]);
+                }
+            }
+            res
+        }
+        /// Searches all the matching proteins for a search_string/peptide in the suffix array
+        ///
+        /// # Arguments
+        /// * `search_string` - The string/peptide being searched
+        /// * `equalize_i_and_l` - If set to true, I and L are equalized during search
+        ///
+        /// # Returns
+        ///
+        /// Returns the matching proteins for the search_string
+        pub fn search_proteins_for_peptide(
+            &self,
+            search_string: &[u8],
+            equalize_i_and_l: bool,
+        ) -> Vec<&Protein> {
+            let mut matching_suffixes = ::alloc::vec::Vec::new();
+            if let
SearchAllSuffixesResult::SearchResult(suffixes) = self + .search_matching_suffixes(search_string, usize::MAX, equalize_i_and_l) + { + matching_suffixes = suffixes; + } + self.retrieve_proteins(&matching_suffixes) + } + /// Retrieves the taxonomic analysis for a collection of proteins + /// + /// # Arguments + /// * `proteins` - A collection of proteins + /// + /// # Returns + /// + /// Returns the taxonomic analysis result for the given list of proteins + #[inline] + pub fn retrieve_lca(&self, proteins: &[&Protein]) -> Option { + let taxon_ids: Vec = proteins + .iter() + .map(|prot| prot.taxon_id) + .collect(); + self.taxon_id_calculator + .aggregate(taxon_ids) + .map(|id| self.taxon_id_calculator.snap_taxon(id)) + } + /// Returns true if the protein is considered valid by the provided taxonomy file + /// + /// # Arguments + /// * `protein` - A protein of which we want to check the validity + /// + /// # Returns + /// + /// Returns true if the protein is considered valid by the provided taxonomy file + pub fn taxon_valid(&self, protein: &Protein) -> bool { + self.taxon_id_calculator.taxon_valid(protein.taxon_id) + } + /// Retrieves the functional analysis for a collection of proteins + /// + /// # Arguments + /// * `proteins` - A collection of proteins + /// + /// # Returns + /// + /// Returns the functional analysis result for the given list of proteins + pub fn retrieve_function( + &self, + proteins: &[&Protein], + ) -> Option { + let res = self.function_aggregator.aggregate(proteins.to_vec()); + Some(res) + } + /// Retrieves the all the functional annotations for a collection of proteins + /// + /// # Arguments + /// * `proteins` - A collection of proteins + /// + /// # Returns + /// + /// Returns all functional annotations for a collection of proteins + pub fn get_all_functional_annotations( + &self, + proteins: &[&Protein], + ) -> Vec> { + self.function_aggregator.get_all_functional_annotations(proteins) + } + } +} +pub mod suffix_to_protein_index { + use clap::ValueEnum; + use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER}; + use crate::Nullable; + /// Enum used to define the commandline arguments and choose which index style is used + pub enum SuffixToProteinMappingStyle { + Dense, + Sparse, + } + #[allow( + dead_code, + unreachable_code, + unused_variables, + unused_braces, + unused_qualifications, + )] + #[allow( + clippy::style, + clippy::complexity, + clippy::pedantic, + clippy::restriction, + clippy::perf, + clippy::deprecated, + clippy::nursery, + clippy::cargo, + clippy::suspicious_else_formatting, + clippy::almost_swapped, + clippy::redundant_locals, + )] + #[automatically_derived] + impl clap::ValueEnum for SuffixToProteinMappingStyle { + fn value_variants<'a>() -> &'a [Self] { + &[Self::Dense, Self::Sparse] + } + fn to_possible_value<'a>( + &self, + ) -> ::std::option::Option { + match self { + Self::Dense => Some({ clap::builder::PossibleValue::new("dense") }), + Self::Sparse => Some({ clap::builder::PossibleValue::new("sparse") }), + _ => None, + } + } + } + #[automatically_derived] + impl ::core::clone::Clone for SuffixToProteinMappingStyle { + #[inline] + fn clone(&self) -> SuffixToProteinMappingStyle { + match self { + SuffixToProteinMappingStyle::Dense => SuffixToProteinMappingStyle::Dense, + SuffixToProteinMappingStyle::Sparse => { + SuffixToProteinMappingStyle::Sparse + } + } + } + } + #[automatically_derived] + impl ::core::fmt::Debug for SuffixToProteinMappingStyle { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> 
::core::fmt::Result { + ::core::fmt::Formatter::write_str( + f, + match self { + SuffixToProteinMappingStyle::Dense => "Dense", + SuffixToProteinMappingStyle::Sparse => "Sparse", + }, + ) + } + } + #[automatically_derived] + impl ::core::marker::StructuralPartialEq for SuffixToProteinMappingStyle {} + #[automatically_derived] + impl ::core::cmp::PartialEq for SuffixToProteinMappingStyle { + #[inline] + fn eq(&self, other: &SuffixToProteinMappingStyle) -> bool { + let __self_tag = ::core::intrinsics::discriminant_value(self); + let __arg1_tag = ::core::intrinsics::discriminant_value(other); + __self_tag == __arg1_tag + } + } + /// Trait implemented by the SuffixToProtein mappings + pub trait SuffixToProteinIndex: Send + Sync { + /// Returns the index of the protein in the protein list for the given suffix + /// + /// # Arguments + /// * `suffix` - The suffix of which we want to know of which protein it is a part + /// + /// # Returns + /// + /// Returns the index of the protein in the proteins list of which the suffix is a part + fn suffix_to_protein(&self, suffix: i64) -> u32; + } + /// Mapping that uses O(n) memory with n the size of the input text, but retrieval of the protein is + /// in O(1) + pub struct DenseSuffixToProtein { + mapping: Vec, + } + #[automatically_derived] + impl ::core::fmt::Debug for DenseSuffixToProtein { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_struct_field1_finish( + f, + "DenseSuffixToProtein", + "mapping", + &&self.mapping, + ) + } + } + #[automatically_derived] + impl ::core::marker::StructuralPartialEq for DenseSuffixToProtein {} + #[automatically_derived] + impl ::core::cmp::PartialEq for DenseSuffixToProtein { + #[inline] + fn eq(&self, other: &DenseSuffixToProtein) -> bool { + self.mapping == other.mapping + } + } + /// Mapping that uses O(m) memory with m the number of proteins, but retrieval of the protein is + /// O(log m) + pub struct SparseSuffixToProtein { + mapping: Vec, + } + #[automatically_derived] + impl ::core::fmt::Debug for SparseSuffixToProtein { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_struct_field1_finish( + f, + "SparseSuffixToProtein", + "mapping", + &&self.mapping, + ) + } + } + #[automatically_derived] + impl ::core::marker::StructuralPartialEq for SparseSuffixToProtein {} + #[automatically_derived] + impl ::core::cmp::PartialEq for SparseSuffixToProtein { + #[inline] + fn eq(&self, other: &SparseSuffixToProtein) -> bool { + self.mapping == other.mapping + } + } + impl SuffixToProteinIndex for DenseSuffixToProtein { + fn suffix_to_protein(&self, suffix: i64) -> u32 { + self.mapping[suffix as usize] + } + } + impl SuffixToProteinIndex for SparseSuffixToProtein { + fn suffix_to_protein(&self, suffix: i64) -> u32 { + let protein_index = self + .mapping + .binary_search(&suffix) + .unwrap_or_else(|index| index - 1); + if self.mapping[protein_index + 1] == suffix + 1 { + return u32::NULL; + } + protein_index as u32 + } + } + impl DenseSuffixToProtein { + /// Creates a new DenseSuffixToProtein mapping + /// + /// # Arguments + /// * `text` - The text over which we want to create the mapping + /// + /// # Returns + /// + /// Returns a new DenseSuffixToProtein build over the provided text + pub fn new(text: &[u8]) -> Self { + let mut current_protein_index: u32 = 0; + let mut suffix_index_to_protein: Vec = ::alloc::vec::Vec::new(); + for &char in text.iter() { + if char == SEPARATION_CHARACTER || 
char == TERMINATION_CHARACTER {
+                    current_protein_index += 1;
+                    suffix_index_to_protein.push(u32::NULL);
+                } else {
+                    match (&current_protein_index, &u32::NULL) {
+                        (left_val, right_val) => {
+                            if *left_val == *right_val {
+                                let kind = ::core::panicking::AssertKind::Ne;
+                                ::core::panicking::assert_failed(
+                                    kind,
+                                    &*left_val,
+                                    &*right_val,
+                                    ::core::option::Option::None,
+                                );
+                            }
+                        }
+                    };
+                    suffix_index_to_protein.push(current_protein_index);
+                }
+            }
+            suffix_index_to_protein.shrink_to_fit();
+            DenseSuffixToProtein {
+                mapping: suffix_index_to_protein,
+            }
+        }
+    }
+    impl SparseSuffixToProtein {
+        /// Creates a new SparseSuffixToProtein mapping
+        ///
+        /// # Arguments
+        /// * `text` - The text over which we want to create the mapping
+        ///
+        /// # Returns
+        ///
+        /// Returns a new SparseSuffixToProtein built over the provided text
+        pub fn new(text: &[u8]) -> Self {
+            let mut suffix_index_to_protein: Vec<i64> = <[_]>::into_vec(
+                #[rustc_box]
+                ::alloc::boxed::Box::new([0]),
+            );
+            for (index, &char) in text.iter().enumerate() {
+                if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER {
+                    suffix_index_to_protein.push(index as i64 + 1);
+                }
+            }
+            suffix_index_to_protein.shrink_to_fit();
+            SparseSuffixToProtein {
+                mapping: suffix_index_to_protein,
+            }
+        }
+    }
+}
+/// Represents a suffix array.
+pub enum SuffixArray {
+    /// The original suffix array.
+    Original(Vec<i64>),
+    /// The compressed suffix array.
+    Compressed(BitArray),
+}
+impl SuffixArray {
+    /// Returns the length of the suffix array.
+    ///
+    /// # Returns
+    ///
+    /// The length of the suffix array.
+    pub fn len(&self) -> usize {
+        match self {
+            SuffixArray::Original(sa) => sa.len(),
+            SuffixArray::Compressed(sa) => sa.len(),
+        }
+    }
+    /// Returns the suffix array value at the given index.
+    ///
+    /// # Arguments
+    ///
+    /// * `index` - The index of the value to return.
+    ///
+    /// # Returns
+    ///
+    /// The suffix array value at the given index.
+    pub fn get(&self, index: usize) -> i64 {
+        match self {
+            SuffixArray::Original(sa) => sa[index],
+            SuffixArray::Compressed(sa) => sa.get(index) as i64,
+        }
+    }
+    /// Returns whether the suffix array is empty.
+    ///
+    /// # Returns
+    ///
+    /// True if the suffix array is empty, false otherwise.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
+/// Custom trait implemented by types that have a value that represents NULL
+pub trait Nullable<T> {
+    const NULL: T;
+    /// Returns whether the value is NULL.
+    ///
+    /// # Returns
+    ///
+    /// True if the value is NULL, false otherwise.
+    fn is_null(&self) -> bool;
+}
+/// Implementation of the `Nullable` trait for the `u32` type.
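The dense/sparse tradeoff documented above (O(n) memory with O(1) lookup versus O(m) memory with O(log m) lookup) is easy to see in isolation. A minimal self-contained sketch of the sparse idea, using a hypothetical `boundaries` vector of protein start offsets:

```rust
// Sketch: `boundaries` holds the start offset of every protein in the
// concatenated text (with a leading 0), so the protein owning a suffix is
// found with a binary search, mirroring SparseSuffixToProtein above.
fn protein_for_suffix(boundaries: &[i64], suffix: i64) -> usize {
    boundaries
        .binary_search(&suffix)
        .unwrap_or_else(|insertion_point| insertion_point - 1)
}

fn main() {
    let boundaries = vec![0, 12, 30, 57]; // hypothetical protein start offsets
    assert_eq!(protein_for_suffix(&boundaries, 15), 1); // falls in [12, 30)
    assert_eq!(protein_for_suffix(&boundaries, 30), 2); // exact boundary hit
}
```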
+impl Nullable for u32 { + const NULL: u32 = u32::MAX; + fn is_null(&self) -> bool { + *self == Self::NULL + } +} +pub(crate) use define_struct; diff --git a/sa-index/src/peptide_search.rs b/sa-index/src/peptide_search.rs index 8e2c0dd..08a535c 100644 --- a/sa-index/src/peptide_search.rs +++ b/sa-index/src/peptide_search.rs @@ -84,6 +84,7 @@ pub fn search_proteins_for_peptide<'a>( } SearchAllSuffixesResult::SearchResult(matched_suffixes) => matched_suffixes, SearchAllSuffixesResult::NoMatches => { + eprintln!("No matches found for peptide: {}", peptide); return None; } }; @@ -277,3 +278,72 @@ pub fn search_all_peptides( result: res } } + +#[cfg(test)] +mod tests { + use super::*; + + fn assert_json_eq(generated_json: &str, expected_json: &str) { + assert_eq!( + generated_json.parse::().unwrap(), + expected_json.parse::().unwrap(), + ); + } + + #[test] + fn test_serialize_output_data() { + let output_data = OutputData { + result: vec![ 1, 2, 3 ] + }; + + let generated_json = serde_json::to_string(&output_data).unwrap(); + let expected_json = "{\"result\":[1,2,3]}"; + + assert_json_eq(&generated_json, expected_json); + } + + #[test] + fn test_serialize_search_result_with_analysis() { + let search_result = SearchResultWithAnalysis { + sequence: "MSKIAALLPSV".to_string(), + lca: Some(1), + taxa: vec![1, 2, 3], + uniprot_accession_numbers: vec!["P12345".to_string(), "P23456".to_string()], + fa: None, + cutoff_used: true + }; + + let generated_json = serde_json::to_string(&search_result).unwrap(); + let expected_json = "{\"sequence\":\"MSKIAALLPSV\",\"lca\":1,\"taxa\":[1,2,3],\"uniprot_accession_numbers\":[\"P12345\",\"P23456\"],\"fa\":null,\"cutoff_used\":true}"; + + assert_json_eq(&generated_json, expected_json); + } + + #[test] + fn test_serialize_protein_info() { + let protein_info = ProteinInfo { + taxon: 1, + uniprot_accession: "P12345".to_string(), + functional_annotations: vec!["GO:0001234".to_string(), "GO:0005678".to_string()] + }; + + let generated_json = serde_json::to_string(&protein_info).unwrap(); + let expected_json = "{\"taxon\":1,\"uniprot_accession\":\"P12345\",\"functional_annotations\":[\"GO:0001234\",\"GO:0005678\"]}"; + + assert_json_eq(&generated_json, expected_json); + } + + #[test] + fn test_serialize_search_only_result() { + let search_result = SearchOnlyResult { + sequence: "MSKIAALLPSV".to_string(), + proteins: vec![], + cutoff_used: true + }; + + let generated_json = serde_json::to_string(&search_result).unwrap(); + let expected_json = "{\"sequence\":\"MSKIAALLPSV\",\"proteins\":[],\"cutoff_used\":true}"; + + assert_json_eq(&generated_json, expected_json); + } +} diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index 5bf924b..d5ab080 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -17,10 +17,7 @@ use crate::{ sa_searcher::BoundSearch::{ Maximum, Minimum - }, - suffix_to_protein_index::SuffixToProteinIndex, - Nullable, - SuffixArray + }, suffix_to_protein_index::SuffixToProteinIndex, Nullable, SuffixArray }; /// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array @@ -101,12 +98,12 @@ impl PartialEq for SearchAllSuffixesResult { /// * `function_aggregator` - Object used to retrieve the functional annotations and to calculate /// the functional analysis provided by Unipept pub struct Searcher { - sa: SuffixArray, + pub sa: SuffixArray, pub sparseness_factor: u8, - suffix_index_to_protein: Box, - proteins: Proteins, - taxon_id_calculator: TaxonAggregator, - function_aggregator: 
FunctionAggregator + pub suffix_index_to_protein: Box, + pub proteins: Proteins, + pub taxon_id_calculator: TaxonAggregator, + pub function_aggregator: FunctionAggregator } impl Searcher { @@ -548,3 +545,28 @@ impl Searcher { .get_all_functional_annotations(proteins) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_partial_eq_search_all_suffixes_result() { + let search_all_suffixes_result_1 = SearchAllSuffixesResult::SearchResult(vec![1, 2, 3]); + let search_all_suffixes_result_2 = SearchAllSuffixesResult::SearchResult(vec![3, 2, 1]); + let search_all_suffixes_result_3 = SearchAllSuffixesResult::SearchResult(vec![1, 2, 4]); + let search_all_suffixes_result_4 = SearchAllSuffixesResult::MaxMatches(vec![1, 2, 3]); + let search_all_suffixes_result_5 = SearchAllSuffixesResult::MaxMatches(vec![3, 2, 1]); + let search_all_suffixes_result_6 = SearchAllSuffixesResult::MaxMatches(vec![1, 2, 4]); + let search_all_suffixes_result_7 = SearchAllSuffixesResult::NoMatches; + let search_all_suffixes_result_8 = SearchAllSuffixesResult::NoMatches; + + assert_eq!(search_all_suffixes_result_1, search_all_suffixes_result_2); + assert_ne!(search_all_suffixes_result_1, search_all_suffixes_result_3); + assert_eq!(search_all_suffixes_result_4, search_all_suffixes_result_5); + assert_ne!(search_all_suffixes_result_4, search_all_suffixes_result_6); + assert_eq!(search_all_suffixes_result_7, search_all_suffixes_result_8); + assert_ne!(search_all_suffixes_result_1, search_all_suffixes_result_7); + assert_ne!(search_all_suffixes_result_4, search_all_suffixes_result_7); + } +} diff --git a/sa-mappings/src/taxonomy.rs b/sa-mappings/src/taxonomy.rs index a1aabb7..09c330b 100644 --- a/sa-mappings/src/taxonomy.rs +++ b/sa-mappings/src/taxonomy.rs @@ -14,10 +14,7 @@ use umgap::{ mix::MixCalculator }, taxon::{ - read_taxa_file, - TaxonId, - TaxonList, - TaxonTree + read_taxa_file, Taxon, TaxonId, TaxonList, TaxonTree } }; @@ -25,10 +22,8 @@ use umgap::{ pub struct TaxonAggregator { /// A vector that contains the snapped taxon IDs. snapping: Vec>, - /// The aggregator used to aggregate taxon IDs. aggregator: Box, - /// The taxon list. taxon_list: TaxonList } @@ -43,6 +38,33 @@ pub enum AggregationMethod { } impl TaxonAggregator { + /// Creates a new `TaxonAggregator` with the given taxa and aggregation method. + /// + /// # Arguments + /// + /// * `taxa` - A vector of `Taxon` objects representing the taxa. + /// * `method` - An `AggregationMethod` enum specifying the aggregation method to use. + /// + /// # Returns + /// + /// Returns a new `TaxonAggregator` instance. + pub fn new(taxa: Vec, method: AggregationMethod) -> Self { + let taxon_tree = TaxonTree::new(&taxa); + let taxon_list = TaxonList::new(taxa); + let snapping = taxon_tree.snapping(&taxon_list, true); + + let aggregator: Box = match method { + AggregationMethod::Lca => Box::new(MixCalculator::new(taxon_tree, 1.0)), + AggregationMethod::LcaStar => Box::new(LCACalculator::new(taxon_tree)) + }; + + Self { + snapping, + aggregator, + taxon_list + } + } + /// Creates a new `TaxonAggregator` from a taxonomy file and an aggregation method. 
/// /// # Arguments @@ -62,20 +84,7 @@ impl TaxonAggregator { method: AggregationMethod ) -> Result> { let taxons = read_taxa_file(file)?; - let taxon_tree = TaxonTree::new(&taxons); - let taxon_list = TaxonList::new(taxons); - let snapping = taxon_tree.snapping(&taxon_list, true); - - let aggregator: Box = match method { - AggregationMethod::Lca => Box::new(MixCalculator::new(taxon_tree, 1.0)), - AggregationMethod::LcaStar => Box::new(LCACalculator::new(taxon_tree)) - }; - - Ok(Self { - snapping, - aggregator, - taxon_list - }) + Ok(Self::new(taxons, method)) } /// Checks if a taxon exists in the taxon list. @@ -157,6 +166,7 @@ mod tests { }; use tempdir::TempDir; + use umgap::rank::Rank; use super::*; @@ -183,6 +193,30 @@ mod tests { taxonomy_file } + #[test] + fn test_new() { + TaxonAggregator::new( + vec![ + Taxon::new(1, "root".to_string(), Rank::NoRank, 1, true), + Taxon::new(2, "Bacteria".to_string(), Rank::Superkingdom, 1, true), + Taxon::new(6, "Azorhizobium".to_string(), Rank::Genus, 1, true), + Taxon::new(7, "Azorhizobium caulinodans".to_string(), Rank::Species, 6, true), + Taxon::new(9, "Buchnera aphidicola".to_string(), Rank::Species, 6, true), + Taxon::new(10, "Cellvibrio".to_string(), Rank::Genus, 6, true), + Taxon::new(11, "Cellulomonas gilvus".to_string(), Rank::Species, 10, true), + Taxon::new(13, "Dictyoglomus".to_string(), Rank::Genus, 11, true), + Taxon::new(14, "Dictyoglomus thermophilum".to_string(), Rank::Species, 10, true), + Taxon::new(16, "Methylophilus".to_string(), Rank::Genus, 14, true), + Taxon::new(17, "Methylophilus methylotrophus".to_string(), Rank::Species, 16, true), + Taxon::new(18, "Pelobacter".to_string(), Rank::Genus, 17, true), + Taxon::new(19, "Syntrophotalea carbinolica".to_string(), Rank::Species, 17, true), + Taxon::new(20, "Phenylobacterium".to_string(), Rank::Genus, 19, true), + Taxon::new(21, "Invalid".to_string(), Rank::Species, 19, false) + ], + AggregationMethod::Lca + ); + } + #[test] fn test_try_from_taxonomy_file() { // Create a temporary directory for this test From 7c2b6dcb494b0a3cd3c84b1b96d4b3d380324f9d Mon Sep 17 00:00:00 2001 From: tibvdm Date: Wed, 22 May 2024 13:35:03 +0200 Subject: [PATCH 18/26] update codecov config --- codecov.yml | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/codecov.yml b/codecov.yml index 4056f16..5e01697 100644 --- a/codecov.yml +++ b/codecov.yml @@ -16,34 +16,13 @@ coverage: flags: - sa-compression sa-index: - target: 65% - flags: - - sa-index - sa-mappings: - flags: - - sa-mappings - patch: - default: - target: 90% - bitarray: - flags: - - bitarray - fa-compression: - flags: - - fa-compression - sa-builder: - flags: - - sa-builder - sa-compression: - flags: - - sa-compression - sa-index: - target: 65% + target: 80% flags: - sa-index sa-mappings: flags: - sa-mappings + patch: off flags: bitarray: From 263d6972e9e57dc723ad03102c680a9c7fbc2855 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Wed, 22 May 2024 13:50:06 +0200 Subject: [PATCH 19/26] ignore main files when covering code --- codecov.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/codecov.yml b/codecov.yml index 5e01697..0a92f62 100644 --- a/codecov.yml +++ b/codecov.yml @@ -49,3 +49,6 @@ flags: paths: - sa-mappings carryforward: true + +ignore: + - "**/main.rs" # Ignore main.rs files From 1091ff21638421ffa351b006d83d8703b7f4e01c Mon Sep 17 00:00:00 2001 From: tibvdm Date: Wed, 22 May 2024 13:57:59 +0200 Subject: [PATCH 20/26] test failing reader and writer in test module --- 
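The tests added below reference `FailingWriter` and `FailingReader` test doubles that are defined elsewhere in this crate and are not shown in this patch. A minimal sketch of what such a writer might look like, assuming it fails after a configurable number of successful calls (the reader would be analogous):

```rust
use std::io::{Error, ErrorKind, Result, Write};

// Hypothetical sketch: a writer that succeeds `valid_write_count` times,
// then returns an error, so that error paths can be exercised in tests.
struct FailingWriter {
    valid_write_count: usize,
}

impl Write for FailingWriter {
    fn write(&mut self, buf: &[u8]) -> Result<usize> {
        if self.valid_write_count == 0 {
            return Err(Error::new(ErrorKind::Other, "simulated write failure"));
        }
        self.valid_write_count -= 1;
        Ok(buf.len())
    }

    fn flush(&mut self) -> Result<()> {
        Ok(())
    }
}
```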
sa-compression/src/lib.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs index d4da1eb..b7bac78 100644 --- a/sa-compression/src/lib.rs +++ b/sa-compression/src/lib.rs @@ -247,4 +247,24 @@ mod tests { load_compressed_suffix_array(&mut reader, 8).unwrap(); } + + #[test] + fn test_failing_writer() { + let mut writer = FailingWriter { + valid_write_count: 0 + }; + assert!(writer.flush().is_ok()); + assert!(writer.write(&[0]).is_err()); + } + + #[test] + fn test_failing_reader() { + let mut reader = FailingReader { + valid_read_count: 0 + }; + assert_eq!(reader.fill_buf().unwrap(), &[]); + assert_eq!(reader.consume(0), ()); + let mut buffer = [0_u8; 1]; + assert!(reader.read(&mut buffer).is_err()); + } } From c15e7201ba7e4ad6ac01610e62c17eaef8f3a105 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Wed, 22 May 2024 14:39:45 +0200 Subject: [PATCH 21/26] small cleanup --- sa-builder/README.md | 28 ++++++++++++++++++++++++++++ sa-builder/src/lib.rs | 12 ++++++------ 2 files changed, 34 insertions(+), 6 deletions(-) create mode 100644 sa-builder/README.md diff --git a/sa-builder/README.md b/sa-builder/README.md new file mode 100644 index 0000000..5f51b07 --- /dev/null +++ b/sa-builder/README.md @@ -0,0 +1,28 @@ +Suffix Array Builder +==================== + +A rust implementation to build large generalized suffix arrays. + +# Usage + +```plain +Build a (sparse, compressed) suffix array from the given text + +Usage: sa-builder [OPTIONS] --database-file --taxonomy --output + +Options: + -d, --database-file + File with the proteins used to build the suffix tree. All the proteins are expected to be concatenated using a hashtag `#` + -t, --taxonomy + The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy + -o, --output + Output location where to store the suffix array + -s, --sparseness-factor + The sparseness_factor used on the suffix array (default value 1, which means every value in the SA is used) [default: 1] + -a, --construction-algorithm + The algorithm used to construct the suffix array (default value LibSais) [default: lib-sais] [possible values: lib-div-suf-sort, lib-sais] + -c, --compress-sa + If the suffix array should be compressed (default value true) + -h, --help + Print help +``` diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 57a4cfa..98ac63d 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -5,28 +5,28 @@ use clap::{ ValueEnum }; -/// Enum that represents all possible commandline arguments +/// Build a (sparse, compressed) suffix array from the given text #[derive(Parser, Debug)] pub struct Arguments { /// File with the proteins used to build the suffix tree. All the proteins are expected to be - /// concatenated using a `#`. + /// concatenated using a hashtag `#`. #[arg(short, long)] pub database_file: String, /// The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy. #[arg(short, long)] pub taxonomy: String, - /// Output file to store the built index. 
+ /// Output location where to store the suffix array #[arg(short, long)] pub output: String, /// The sparseness_factor used on the suffix array (default value 1, which means every value in /// the SA is used) - #[arg(long, default_value_t = 1)] + #[arg(short, long, default_value_t = 1)] pub sparseness_factor: u8, /// The algorithm used to construct the suffix array (default value LibSais) - #[arg(short, long, value_enum, default_value_t = SAConstructionAlgorithm::LibSais)] + #[arg(short('a'), long, value_enum, default_value_t = SAConstructionAlgorithm::LibSais)] pub construction_algorithm: SAConstructionAlgorithm, /// If the suffix array should be compressed (default value true) - #[arg(long, default_value_t = false)] + #[arg(short, long, default_value_t = false)] pub compress_sa: bool } From 010f80b79ea5841fcf32968ec600934a6e330759 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Wed, 22 May 2024 14:39:52 +0200 Subject: [PATCH 22/26] small cleanup --- README.md | 10 +- sa-index/out.txt | 1515 ---------------------------------------------- 2 files changed, 6 insertions(+), 1519 deletions(-) delete mode 100644 sa-index/out.txt diff --git a/README.md b/README.md index 9a4607c..15f8897 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,20 @@ ![Codecov](https://img.shields.io/codecov/c/github/unipept/unipept-index?token=IZ75A2FY98&logo=Codecov) -The unipept index written entirely in `Rust`. This repository consists of multiple different Rust projects that depend on -each other. More information about each project can be found in their respective `README.md` file. +The unipept index written entirely in `Rust`. This repository consists of multiple different Rust projects that depend on each other. More information about each project can be found in their respective `README.md` file. ## Installation -Clone this repository with the following command: +> [!NOTE] +> To build and use the Unipept Index, you need to have Rust installed. If you don't have Rust installed, you can get it from [rust-lang.org](https://www.rust-lang.org/). + +Clone this repository by executing the following command: ```bash git clone https://github.com/unipept/unipept-index.git ``` -And build the projects using: +Next, build everything by executing the following command in the root of the repository. ```bash cargo build --release diff --git a/sa-index/out.txt b/sa-index/out.txt deleted file mode 100644 index 0bd0be1..0000000 --- a/sa-index/out.txt +++ /dev/null @@ -1,1515 +0,0 @@ -#![feature(prelude_import)] -#[prelude_import] -use std::prelude::rust_2021::*; -#[macro_use] -extern crate std; -use bitarray::BitArray; -pub mod binary { - use std::{error::Error, io::{BufRead, Read, Write}}; - /// The `Binary` trait provides methods for reading and writing a struct as binary. - pub trait Binary { - /// Writes the struct as binary to the given writer. - /// - /// # Arguments - /// - /// * `writer` - The writer to write the binary data to. - /// - /// # Returns - /// - /// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs. - fn write_binary(&self, writer: &mut W) -> std::io::Result<()>; - /// Reads binary data into a struct from the given reader. - /// - /// # Arguments - /// - /// * `reader` - The reader to read the binary data from. - /// - /// # Returns - /// - /// Returns `Ok(())` if the read operation is successful, or an `Err` if an error occurs. - fn read_binary(&mut self, reader: R) -> std::io::Result<()>; - } - /// Implements the `Binary` trait for `Vec`. 
-    impl Binary for Vec<i64> {
-        /// Writes the elements of the vector to a binary file.
-        ///
-        /// # Arguments
-        ///
-        /// * `writer` - The writer to which the binary data will be written.
-        ///
-        /// # Returns
-        ///
-        /// Returns `Ok(())` if the write operation is successful, or an `std::io::Error` otherwise.
-        fn write_binary<W: Write>(&self, writer: &mut W) -> std::io::Result<()> {
-            for value in self {
-                writer.write_all(&value.to_le_bytes())?;
-            }
-            Ok(())
-        }
-
-        /// Reads binary data from a reader and populates the vector with the read values.
-        ///
-        /// # Arguments
-        ///
-        /// * `reader` - The reader from which the binary data will be read.
-        ///
-        /// # Returns
-        ///
-        /// Returns `Ok(())` if the read operation is successful, or an `std::io::Error` otherwise.
-        fn read_binary<R: Read>(&mut self, mut reader: R) -> std::io::Result<()> {
-            self.clear();
-            let mut buffer = vec![0; 8 * 1024];
-            loop {
-                let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer)?;
-                for buffer_slice in buffer[..bytes_read].chunks_exact(8) {
-                    self.push(i64::from_le_bytes(buffer_slice.try_into().unwrap()));
-                }
-                if finished {
-                    break;
-                }
-            }
-            Ok(())
-        }
-    }
-
-    /// Writes the suffix array to a binary file.
-    ///
-    /// # Arguments
-    ///
-    /// * `sa` - The suffix array to dump.
-    /// * `sparseness_factor` - The sparseness factor to write to the file.
-    /// * `writer` - The writer to write the binary data to.
-    ///
-    /// # Returns
-    ///
-    /// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs.
-    pub fn dump_suffix_array(
-        sa: &Vec<i64>,
-        sparseness_factor: u8,
-        writer: &mut impl Write,
-    ) -> Result<(), Box<dyn Error>> {
-        writer
-            .write(&[64_u8])
-            .map_err(|_| "Could not write the required bits to the writer")?;
-        writer
-            .write(&[sparseness_factor])
-            .map_err(|_| "Could not write the sparseness factor to the writer")?;
-        let sa_len = sa.len();
-        writer
-            .write(&(sa_len).to_le_bytes())
-            .map_err(|_| "Could not write the size of the suffix array to the writer")?;
-        sa.write_binary(writer)
-            .map_err(|_| "Could not write the suffix array to the writer")?;
-        Ok(())
-    }
-
-    /// Loads the suffix array from the file with the given `filename`
-    ///
-    /// # Arguments
-    /// * `filename` - The filename of the file where the suffix array is stored
-    ///
-    /// # Returns
-    ///
-    /// Returns the sample rate of the suffix array, together with the suffix array
-    ///
-    /// # Errors
-    ///
-    /// Returns any error from opening the file or reading the file
-    pub fn load_suffix_array(
-        reader: &mut impl BufRead,
-    ) -> Result<(u8, Vec<i64>), Box<dyn Error>> {
-        let mut sample_rate_buffer = [0_u8; 1];
-        reader
-            .read_exact(&mut sample_rate_buffer)
-            .map_err(|_| "Could not read the sample rate from the binary file")?;
-        let sample_rate = sample_rate_buffer[0];
-        let mut size_buffer = [0_u8; 8];
-        reader
-            .read_exact(&mut size_buffer)
-            .map_err(|_| "Could not read the size of the suffix array from the binary file")?;
-        let size = u64::from_le_bytes(size_buffer) as usize;
-        let mut sa = Vec::with_capacity(size);
-        sa.read_binary(reader)
-            .map_err(|_| "Could not read the suffix array from the binary file")?;
-        Ok((sample_rate, sa))
-    }
-
-    /// Fills the buffer with data read from the input.
-    ///
-    /// # Arguments
-    ///
-    /// * `input` - The input source to read data from.
-    /// * `buffer` - The buffer to fill with data.
-    ///
-    /// # Returns
-    ///
-    /// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the
-    /// input is reached, and `bytes_read` is the number of bytes read into the buffer.
-    fn fill_buffer<T: Read>(
-        input: &mut T,
-        buffer: &mut Vec<u8>,
-    ) -> std::io::Result<(bool, usize)> {
-        let buffer_size = buffer.len();
-        let mut writable_buffer_space = buffer.as_mut();
-        loop {
-            match input.read(writable_buffer_space) {
-                Ok(0) => {
-                    return Ok((
-                        !writable_buffer_space.is_empty(),
-                        buffer_size - writable_buffer_space.len(),
-                    ));
-                }
-                Ok(bytes_read) => {
-                    writable_buffer_space = writable_buffer_space[bytes_read..].as_mut();
-                }
-                Err(e) => {
-                    return Err(e);
-                }
-            }
-        }
-    }
-}
-
-pub mod peptide_search {
-    use rayon::prelude::*;
-    use sa_mappings::{functionality::FunctionalAggregation, proteins::Protein};
-    use serde::Serialize;
-
-    use crate::sa_searcher::{SearchAllSuffixesResult, Searcher};
-
-    /// Struct representing a collection of `SearchResultWithAnalysis` or `SearchOnlyResult` results
-    #[derive(Debug, Serialize)]
-    pub struct OutputData<T: Serialize> {
-        result: Vec<T>,
-    }
-
-    /// Struct representing the search result of the `sequence` in the index, including the analyses
-    #[derive(Debug, Serialize)]
-    pub struct SearchResultWithAnalysis {
-        sequence: String,
-        lca: Option<usize>,
-        taxa: Vec<usize>,
-        uniprot_accession_numbers: Vec<String>,
-        fa: Option<FunctionalAggregation>,
-        cutoff_used: bool,
-    }
-
-    /// Struct representing the search result of the `sequence` in the index (without the analyses)
-    #[derive(Debug, Serialize)]
-    pub struct SearchOnlyResult {
-        sequence: String,
-        proteins: Vec<ProteinInfo>,
-        cutoff_used: bool,
-    }
-
-    /// Struct that represents all information known about a certain protein in our database
-    #[derive(Debug, Serialize)]
-    pub struct ProteinInfo {
-        taxon: usize,
-        uniprot_accession: String,
-        functional_annotations: Vec<String>,
-    }
-
-    /// Searches the `peptide` in the index multithreaded and retrieves the matching proteins
-    ///
-    /// # Arguments
-    /// * `searcher` - The Searcher which contains the protein database
-    /// * `peptide` - The peptide that is being searched in the index
-    /// * `cutoff` - The maximum amount of matches we want to process from the index
-    /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search
-    /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in
-    ///   the taxonomy
-    ///
-    /// # Returns
-    ///
-    /// Returns Some if matches are found.
-    /// The first argument is true if the cutoff is used, otherwise false.
-    /// The second argument is a list of all matching proteins for the peptide.
-    /// Returns None if the peptide does not have any matches, or if the peptide is shorter than
-    /// the sparseness factor k used in the index
-    pub fn search_proteins_for_peptide<'a>(
-        searcher: &'a Searcher,
-        peptide: &str,
-        cutoff: usize,
-        equalize_i_and_l: bool,
-        clean_taxa: bool,
-    ) -> Option<(bool, Vec<&'a Protein>)> {
-        let peptide = peptide.strip_suffix('\n').unwrap_or(peptide).to_uppercase();
-        if peptide.len() < searcher.sparseness_factor as usize {
-            return None;
-        }
-        let suffix_search =
-            searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equalize_i_and_l);
-        let mut cutoff_used = false;
-        let suffixes = match suffix_search {
-            SearchAllSuffixesResult::MaxMatches(matched_suffixes) => {
-                cutoff_used = true;
-                matched_suffixes
-            }
-            SearchAllSuffixesResult::SearchResult(matched_suffixes) => matched_suffixes,
-            SearchAllSuffixesResult::NoMatches => {
-                return None;
-            }
-        };
-        let mut proteins = searcher.retrieve_proteins(&suffixes);
-        if clean_taxa {
-            proteins.retain(|protein| searcher.taxon_valid(protein))
-        }
-        Some((cutoff_used, proteins))
-    }
-
-    /// Searches the `peptide` in the index multithreaded and retrieves the protein information
-    /// from the database. This does NOT perform any of the analyses, it only retrieves the
-    /// functional and taxonomic annotations
-    ///
-    /// # Arguments
-    /// * `searcher` - The Searcher which contains the protein database
-    /// * `peptide` - The peptide that is being searched in the index
-    /// * `cutoff` - The maximum amount of matches we want to process from the index
-    /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search
-    /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in
-    ///   the taxonomy
-    ///
-    /// # Returns
-    ///
-    /// Returns Some(SearchOnlyResult) if the peptide has matches.
-    /// Returns None if the peptide does not have any matches, or if the peptide is shorter than
-    /// the sparseness factor k used in the index
-    pub fn search_peptide_retrieve_annotations(
-        searcher: &Searcher,
-        peptide: &str,
-        cutoff: usize,
-        equalize_i_and_l: bool,
-        clean_taxa: bool,
-    ) -> Option<SearchOnlyResult> {
-        let (cutoff_used, proteins) =
-            search_proteins_for_peptide(searcher, peptide, cutoff, equalize_i_and_l, clean_taxa)?;
-        let annotations = searcher.get_all_functional_annotations(&proteins);
-        let mut protein_info: Vec<ProteinInfo> = Vec::new();
-        for (&protein, annotations) in proteins.iter().zip(annotations) {
-            protein_info.push(ProteinInfo {
-                taxon: protein.taxon_id,
-                uniprot_accession: protein.uniprot_id.clone(),
-                functional_annotations: annotations,
-            })
-        }
-        Some(SearchOnlyResult {
-            sequence: peptide.to_string(),
-            proteins: protein_info,
-            cutoff_used,
-        })
-    }
-
-    /// Searches the `peptide` in the index multithreaded and performs the taxonomic and
-    /// functional analyses
-    ///
-    /// # Arguments
-    /// * `searcher` - The Searcher which contains the protein database
-    /// * `peptide` - The peptide that is being searched in the index
-    /// * `cutoff` - The maximum amount of matches we want to process from the index
-    /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search
-    /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in
-    ///   the taxonomy
-    ///
-    /// # Returns
-    ///
-    /// Returns Some(SearchResultWithAnalysis) if the peptide has matches.
-    /// Returns None if the peptide does not have any matches, or if the peptide is shorter than
-    /// the sparseness factor k used in the index
-    pub fn analyse_peptide(
-        searcher: &Searcher,
-        peptide: &str,
-        cutoff: usize,
-        equalize_i_and_l: bool,
-        clean_taxa: bool,
-    ) -> Option<SearchResultWithAnalysis> {
-        let (cutoff_used, mut proteins) =
-            search_proteins_for_peptide(searcher, peptide, cutoff, equalize_i_and_l, clean_taxa)?;
-        if clean_taxa {
-            proteins.retain(|protein| searcher.taxon_valid(protein))
-        }
-        let lca = if cutoff_used { Some(1) } else { searcher.retrieve_lca(&proteins) };
-        lca?;
-        let mut uniprot_accession_numbers = Vec::new();
-        let mut taxa = Vec::new();
-        for protein in &proteins {
-            taxa.push(protein.taxon_id);
-            uniprot_accession_numbers.push(protein.uniprot_id.clone());
-        }
-        let fa = searcher.retrieve_function(&proteins);
-        Some(SearchResultWithAnalysis {
-            sequence: peptide.to_string(),
-            lca,
-            cutoff_used,
-            uniprot_accession_numbers,
-            taxa,
-            fa,
-        })
-    }
-
-    /// Searches the list of `peptides` in the index multithreaded and performs the functional and
-    /// taxonomic analyses
-    ///
-    /// # Arguments
-    /// * `searcher` - The Searcher which contains the protein database
-    /// * `peptides` - List of peptides we want to search in the index
-    /// * `cutoff` - The maximum amount of matches we want to process from the index
-    /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search
-    /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in
-    ///   the taxonomy
-    ///
-    /// # Returns
-    ///
-    /// Returns an `OutputData` object with the search and analyses results for the peptides
-    pub fn analyse_all_peptides(
-        searcher: &Searcher,
-        peptides: &Vec<String>,
-        cutoff: usize,
-        equalize_i_and_l: bool,
-        clean_taxa: bool,
-    ) -> OutputData<SearchResultWithAnalysis> {
-        let res: Vec<SearchResultWithAnalysis> = peptides
-            .par_iter()
-            .map(|peptide| analyse_peptide(searcher, peptide, cutoff, equalize_i_and_l, clean_taxa))
-            .filter_map(|search_result| search_result)
-            .collect();
-        OutputData { result: res }
-    }
-
-    /// Searches the list of `peptides` in the index and retrieves all related information about
-    /// the found proteins. This does NOT perform any of the analyses
-    ///
-    /// # Arguments
-    /// * `searcher` - The Searcher which contains the protein database
-    /// * `peptides` - List of peptides we want to search in the index
-    /// * `cutoff` - The maximum amount of matches we want to process from the index
-    /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search
-    /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in
-    ///   the taxonomy
-    ///
-    /// # Returns
-    ///
-    /// Returns an `OutputData` object with the search results for the peptides
-    pub fn search_all_peptides(
-        searcher: &Searcher,
-        peptides: &Vec<String>,
-        cutoff: usize,
-        equalize_i_and_l: bool,
-        clean_taxa: bool,
-    ) -> OutputData<SearchOnlyResult> {
-        let res: Vec<SearchOnlyResult> = peptides
-            .par_iter()
-            .map(|peptide| {
-                search_peptide_retrieve_annotations(
-                    searcher,
-                    peptide,
-                    cutoff,
-                    equalize_i_and_l,
-                    clean_taxa,
-                )
-            })
-            .filter_map(|search_result| search_result)
-            .collect();
-        OutputData { result: res }
-    }
-}
-
-pub mod sa_searcher {
-    use std::cmp::min;
-
-    use sa_mappings::{
-        functionality::{FunctionAggregator, FunctionalAggregation},
-        proteins::{Protein, Proteins},
-        taxonomy::TaxonAggregator,
-    };
-    use umgap::taxon::TaxonId;
-
-    use crate::{
-        define_struct,
-        sa_searcher::BoundSearch::{Maximum, Minimum},
-        suffix_to_protein_index::SuffixToProteinIndex,
-        Nullable, SuffixArray,
-    };
-
-    /// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array
-    #[derive(Clone, Copy, PartialEq)]
-    enum BoundSearch {
-        Minimum,
-        Maximum,
-    }
-
-    /// Enum representing the minimum and maximum bound of the found matches in the suffix array
-    #[derive(Debug, PartialEq)]
-    pub enum BoundSearchResult {
-        NoMatches,
-        SearchResult((usize, usize)),
-    }
-
-    /// Enum representing the matching suffixes after searching a peptide in the suffix array.
-    /// Both MaxMatches and SearchResult indicate found suffixes, but MaxMatches is used when the
-    /// cutoff is reached.
-    #[derive(Debug)]
-    pub enum SearchAllSuffixesResult {
-        NoMatches,
-        MaxMatches(Vec<i64>),
-        SearchResult(Vec<i64>),
-    }
-
-    /// Custom implementation of PartialEq for SearchAllSuffixesResult.
-    /// We consider two SearchAllSuffixesResult values equal if they are the same variant and the
-    /// Vec contains the same values, but the order can be different
-    impl PartialEq for SearchAllSuffixesResult {
-        fn eq(&self, other: &Self) -> bool {
-            /// Returns true if `arr1` and `arr2` contain the same elements; the order of the
-            /// elements is ignored
-            ///
-            /// # Arguments
-            /// * `arr1` - The first array used in the comparison
-            /// * `arr2` - The second array used in the comparison
-            ///
-            /// # Returns
-            ///
-            /// Returns true if arr1 and arr2 contain the same elements, ignoring their order
-            fn array_eq_unordered(arr1: &[i64], arr2: &[i64]) -> bool {
-                let mut arr1_copy = arr1.to_owned();
-                let mut arr2_copy = arr2.to_owned();
-                arr1_copy.sort();
-                arr2_copy.sort();
-                arr1_copy == arr2_copy
-            }
-
-            match (self, other) {
-                (
-                    SearchAllSuffixesResult::MaxMatches(arr1),
-                    SearchAllSuffixesResult::MaxMatches(arr2),
-                ) => array_eq_unordered(arr1, arr2),
-                (
-                    SearchAllSuffixesResult::SearchResult(arr1),
-                    SearchAllSuffixesResult::SearchResult(arr2),
-                ) => array_eq_unordered(arr1, arr2),
-                (SearchAllSuffixesResult::NoMatches, SearchAllSuffixesResult::NoMatches) => true,
-                _ => false,
-            }
-        }
-    }
-
-    /// Struct that contains all the elements needed to search a peptide in the suffix array.
-    /// This struct also contains all the functions used for search
-    ///
-    /// # Arguments
-    /// * `sa` - The sparse suffix array representing the protein database
-    /// * `sparseness_factor` - The sparseness factor used by the suffix array
-    /// * `suffix_index_to_protein` - Mapping from a suffix to the protein it is part of
-    /// * `taxon_id_calculator` - Object representing the used taxonomy and that calculates the
-    ///   taxonomic analysis provided by Unipept
-    /// * `function_aggregator` - Object used to retrieve the functional annotations and to
-    ///   calculate the functional analysis provided by Unipept
-    pub struct Searcher {
-        pub sa: SuffixArray,
-        pub sparseness_factor: u8,
-        pub suffix_index_to_protein: Box<dyn SuffixToProteinIndex>,
-        pub proteins: Proteins,
-        pub taxon_id_calculator: TaxonAggregator,
-        pub function_aggregator: FunctionAggregator,
-    }
-
-    impl Searcher {
-        /// Creates a new Searcher object
-        ///
-        /// # Arguments
-        /// * `sa` - The sparse suffix array representing the protein database
-        /// * `sparseness_factor` - The sparseness factor used by the suffix array
-        /// * `suffix_index_to_protein` - Mapping from a suffix to the protein it is part of
-        /// * `proteins` - List of all the proteins the suffix array is built over
-        /// * `taxon_id_calculator` - Object representing the used taxonomy and that calculates
-        ///   the taxonomic analysis provided by Unipept
-        /// * `function_aggregator` - Object used to retrieve the functional annotations and to
-        ///   calculate the functional analysis provided by Unipept
-        ///
-        /// # Returns
-        ///
-        /// Returns a new Searcher object
-        pub fn new(
-            sa: SuffixArray,
-            sparseness_factor: u8,
-            suffix_index_to_protein: Box<dyn SuffixToProteinIndex>,
-            proteins: Proteins,
-            taxon_id_calculator: TaxonAggregator,
-            function_aggregator: FunctionAggregator,
-        ) -> Self {
-            Self {
-                sa,
-                sparseness_factor,
-                suffix_index_to_protein,
-                proteins,
-                taxon_id_calculator,
-                function_aggregator,
-            }
-        }
-
-        /// Compares the `search_string` to the `suffix`.
-        /// During search this function performs extra logic since the suffix array is built with
-        /// I == L, while `self.proteins.input_string` is the original text where I != L
-        ///
-        /// # Arguments
-        /// * `search_string` - The string/peptide being searched in the suffix array
-        /// * `suffix` - The current suffix from the suffix array we are comparing with in the
-        ///   binary search
-        /// * `skip` - How many characters we can skip in the comparison because we already know
-        ///   these match
-        /// * `bound` - Indicates if we are searching for the min or max bound
-        ///
-        /// # Returns
-        ///
-        /// The first argument is true if `bound` == `Minimum` and `search_string` <= `suffix` or
-        /// if `bound` == `Maximum` and `search_string` >= `suffix`. The second argument indicates
-        /// how far the `suffix` and `search_string` matched
-        fn compare(
-            &self,
-            search_string: &[u8],
-            suffix: i64,
-            skip: usize,
-            bound: BoundSearch,
-        ) -> (bool, usize) {
-            let mut index_in_suffix = (suffix as usize) + skip;
-            let mut index_in_search_string = skip;
-            let mut is_cond_or_equal = false;
-            let condition_check = match bound {
-                Minimum => |a: u8, b: u8| a < b,
-                Maximum => |a: u8, b: u8| a > b,
-            };
-            while index_in_search_string < search_string.len()
-                && index_in_suffix < self.proteins.input_string.len()
-                && (search_string[index_in_search_string]
-                    == self.proteins.input_string[index_in_suffix]
-                    || (search_string[index_in_search_string] == b'L'
-                        && self.proteins.input_string[index_in_suffix] == b'I')
-                    || (search_string[index_in_search_string] == b'I'
-                        && self.proteins.input_string[index_in_suffix] == b'L'))
-            {
-                index_in_suffix += 1;
-                index_in_search_string += 1;
-            }
-            if !search_string.is_empty() {
-                if index_in_search_string == search_string.len() {
-                    is_cond_or_equal = true
-                } else if index_in_suffix < self.proteins.input_string.len() {
-                    let peptide_char = if search_string[index_in_search_string] == b'L' {
-                        b'I'
-                    } else {
-                        search_string[index_in_search_string]
-                    };
-                    let protein_char = if self.proteins.input_string[index_in_suffix] == b'L' {
-                        b'I'
-                    } else {
-                        self.proteins.input_string[index_in_suffix]
-                    };
-                    is_cond_or_equal = condition_check(peptide_char, protein_char);
-                }
-            }
-            (is_cond_or_equal, index_in_search_string)
-        }
-
-        /// Searches for the minimum or maximum bound for a string in the suffix array
-        ///
-        /// # Arguments
-        /// * `bound` - Indicates if we are searching the minimum or maximum bound
-        /// * `search_string` - The string/peptide we are searching in the suffix array
-        ///
-        /// # Returns
-        ///
-        /// The first argument is true if a match was found.
-        /// The second argument indicates the index of the minimum or maximum bound for the match
-        /// (depending on `bound`)
-        fn binary_search_bound(&self, bound: BoundSearch, search_string: &[u8]) -> (bool, usize) {
-            let mut left: usize = 0;
-            let mut right: usize = self.sa.len();
-            let mut lcp_left: usize = 0;
-            let mut lcp_right: usize = 0;
-            let mut found = false;
-            while right - left > 1 {
-                let center = (left + right) / 2;
-                let skip = min(lcp_left, lcp_right);
-                let (retval, lcp_center) =
-                    self.compare(search_string, self.sa.get(center), skip, bound);
-                found |= lcp_center == search_string.len();
-                if retval && bound == Minimum || !retval && bound == Maximum {
-                    right = center;
-                    lcp_right = lcp_center;
-                } else {
-                    left = center;
-                    lcp_left = lcp_center;
-                }
-            }
-            if right == 1 && left == 0 {
-                let (retval, lcp_center) =
-                    self.compare(search_string, self.sa.get(0), min(lcp_left, lcp_right), bound);
-                found |= lcp_center == search_string.len();
-                if bound == Minimum && retval {
-                    right = 0;
-                }
-            }
-            match bound {
-                Minimum => (found, right),
-                Maximum => (found, left),
-            }
-        }
-
-        /// Searches for the minimum and maximum bound for a string in the suffix array
-        ///
-        /// # Arguments
-        /// * `search_string` - The string/peptide we are searching in the suffix array
-        ///
-        /// # Returns
-        ///
-        /// Returns the minimum and maximum bound of all matches in the suffix array, or
-        /// `NoMatches` if no matches were found
-        pub fn search_bounds(&self, search_string: &[u8]) -> BoundSearchResult {
-            let (found_min, min_bound) = self.binary_search_bound(Minimum, search_string);
-            if !found_min {
-                return BoundSearchResult::NoMatches;
-            }
-            let (_, max_bound) = self.binary_search_bound(Maximum, search_string);
-            BoundSearchResult::SearchResult((min_bound, max_bound + 1))
-        }
-
-        /// Searches for the suffixes matching a search string.
-        /// During search I and L can be equated
-        ///
-        /// # Arguments
-        /// * `search_string` - The string/peptide we are searching in the suffix array
-        /// * `max_matches` - The maximum amount of matches processed, if more matches are found we
-        ///   don't process them
-        /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false
-        ///
-        /// # Returns
-        ///
-        /// Returns all the matching suffixes
-        #[inline]
-        pub fn search_matching_suffixes(
-            &self,
-            search_string: &[u8],
-            max_matches: usize,
-            equalize_i_and_l: bool,
-        ) -> SearchAllSuffixesResult {
-            let mut matching_suffixes: Vec<i64> = Vec::new();
-            let mut il_locations = Vec::new();
-            for (i, &character) in search_string.iter().enumerate() {
-                if character == b'I' || character == b'L' {
-                    il_locations.push(i);
-                }
-            }
-            let mut skip: usize = 0;
-            while skip < self.sparseness_factor as usize {
-                let mut il_locations_start = 0;
-                while il_locations_start < il_locations.len()
-                    && il_locations[il_locations_start] < skip
-                {
-                    il_locations_start += 1;
-                }
-                let il_locations_current_suffix = &il_locations[il_locations_start..];
-                let current_search_string_prefix = &search_string[..skip];
-                let current_search_string_suffix = &search_string[skip..];
-                let search_bound_result = self.search_bounds(&search_string[skip..]);
-                if let BoundSearchResult::SearchResult((min_bound, max_bound)) = search_bound_result
-                {
-                    let mut sa_index = min_bound;
-                    while sa_index < max_bound {
-                        let suffix = self.sa.get(sa_index) as usize;
-                        if suffix >= skip
-                            && ((skip == 0
-                                || Self::check_prefix(
-                                    current_search_string_prefix,
-                                    &self.proteins.input_string[suffix - skip..suffix],
-                                    equalize_i_and_l,
-                                ))
-                                && Self::check_suffix(
-                                    skip,
-                                    il_locations_current_suffix,
-                                    current_search_string_suffix,
-                                    &self.proteins.input_string
-                                        [suffix..suffix + search_string.len() - skip],
-                                    equalize_i_and_l,
-                                ))
-                        {
-                            matching_suffixes.push((suffix - skip) as i64);
-                            if matching_suffixes.len() >= max_matches {
-                                return SearchAllSuffixesResult::MaxMatches(matching_suffixes);
-                            }
-                        }
-                        sa_index += 1;
-                    }
-                }
-                skip += 1;
-            }
-            if matching_suffixes.is_empty() {
-                SearchAllSuffixesResult::NoMatches
-            } else {
-                SearchAllSuffixesResult::SearchResult(matching_suffixes)
-            }
-        }
-
-        /// Returns true if the prefixes are the same.
-        /// If `equalize_i_and_l` is set to true, L and I are considered the same
-        ///
-        /// # Arguments
-        /// * `search_string_prefix` - The unchecked prefix of the string/peptide that is searched
-        /// * `index_prefix` - The unchecked prefix from the protein from the suffix array
-        /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false
-        ///
-        /// # Returns
-        ///
-        /// Returns true if `search_string_prefix` and `index_prefix` are considered the same,
-        /// otherwise false
-        #[inline]
-        fn check_prefix(
-            search_string_prefix: &[u8],
-            index_prefix: &[u8],
-            equalize_i_and_l: bool,
-        ) -> bool {
-            if equalize_i_and_l {
-                search_string_prefix.iter().zip(index_prefix).all(
-                    |(&search_character, &index_character)| {
-                        search_character == index_character
-                            || (search_character == b'I' && index_character == b'L')
-                            || (search_character == b'L' && index_character == b'I')
-                    },
-                )
-            } else {
-                search_string_prefix == index_prefix
-            }
-        }
-
-        /// Returns true if the search_string and index_string are equal.
-        /// This is automatically true if `equalize_i_and_l` is set to true, since they were
-        /// matched during a search where I == L. If `equalize_i_and_l` is set to false, we need
-        /// to check that the I and L locations have the same character
-        ///
-        /// # Arguments
-        /// * `skip` - The used skip factor during the search iteration
-        /// * `il_locations` - The locations of the I's and L's in the **original** peptide
-        /// * `search_string` - The peptide that is being searched, but already with the skipped
-        ///   prefix removed from it
-        /// * `index_string` - The suffix that search_string matches with when I and L were
-        ///   equalized during search
-        /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false
-        ///
-        /// # Returns
-        ///
-        /// Returns true if `search_string` and `index_string` are considered the same, otherwise
-        /// false
-        fn check_suffix(
-            skip: usize,
-            il_locations: &[usize],
-            search_string: &[u8],
-            index_string: &[u8],
-            equalize_i_and_l: bool,
-        ) -> bool {
-            if equalize_i_and_l {
-                true
-            } else {
-                for &il_location in il_locations {
-                    let index = il_location - skip;
-                    if search_string[index] != index_string[index] {
-                        return false;
-                    }
-                }
-                true
-            }
-        }
-
-        /// Returns all the proteins that correspond with the provided suffixes
-        ///
-        /// # Arguments
-        /// * `suffixes` - List of suffix indices
-        ///
-        /// # Returns
-        ///
-        /// Returns the proteins that every suffix is a part of
-        #[inline]
-        pub fn retrieve_proteins(&self, suffixes: &Vec<i64>) -> Vec<&Protein> {
-            let mut res = Vec::new();
-            for &suffix in suffixes {
-                let protein_index = self.suffix_index_to_protein.suffix_to_protein(suffix);
-                if !protein_index.is_null() {
-                    res.push(&self.proteins[protein_index as usize]);
-                }
-            }
-            res
-        }
-
-        /// Searches all the matching proteins for a search_string/peptide in the suffix array
-        ///
-        /// # Arguments
-        /// * `search_string` - The string/peptide being searched
-        /// * `equalize_i_and_l` - If set to true, I and L are equalized during search
-        ///
-        /// # Returns
-        ///
-        /// Returns the matching proteins for the search_string
-        pub fn search_proteins_for_peptide(
-            &self,
-            search_string: &[u8],
-            equalize_i_and_l: bool,
-        ) -> Vec<&Protein> {
-            let mut matching_suffixes = Vec::new();
-            if let SearchAllSuffixesResult::SearchResult(suffixes) =
-                self.search_matching_suffixes(search_string, usize::MAX, equalize_i_and_l)
-            {
-                matching_suffixes = suffixes;
-            }
-            self.retrieve_proteins(&matching_suffixes)
-        }
-
-        /// Retrieves the taxonomic analysis for a collection of proteins
-        ///
-        /// # Arguments
-        /// * `proteins` - A collection of proteins
-        ///
-        /// # Returns
-        ///
-        /// Returns the taxonomic analysis result for the given list of proteins
-        #[inline]
-        pub fn retrieve_lca(&self, proteins: &[&Protein]) -> Option<TaxonId> {
-            let taxon_ids: Vec<TaxonId> = proteins.iter().map(|prot| prot.taxon_id).collect();
-            self.taxon_id_calculator
-                .aggregate(taxon_ids)
-                .map(|id| self.taxon_id_calculator.snap_taxon(id))
-        }
-
-        /// Returns true if the protein is considered valid by the provided taxonomy file
-        ///
-        /// # Arguments
-        /// * `protein` - A protein of which we want to check the validity
-        ///
-        /// # Returns
-        ///
-        /// Returns true if the protein is considered valid by the provided taxonomy file
-        pub fn taxon_valid(&self, protein: &Protein) -> bool {
-            self.taxon_id_calculator.taxon_valid(protein.taxon_id)
-        }
-
-        /// Retrieves the functional analysis for a collection of proteins
-        ///
-        /// # Arguments
-        /// * `proteins` - A collection of proteins
-        ///
-        /// # Returns
-        ///
-        /// Returns the functional analysis result for the given list of proteins
-        pub fn retrieve_function(&self, proteins: &[&Protein]) -> Option<FunctionalAggregation> {
-            let res = self.function_aggregator.aggregate(proteins.to_vec());
-            Some(res)
-        }
-
-        /// Retrieves all the functional annotations for a collection of proteins
-        ///
-        /// # Arguments
-        /// * `proteins` - A collection of proteins
-        ///
-        /// # Returns
-        ///
-        /// Returns all functional annotations for a collection of proteins
-        pub fn get_all_functional_annotations(&self, proteins: &[&Protein]) -> Vec<Vec<String>> {
-            self.function_aggregator.get_all_functional_annotations(proteins)
-        }
-    }
-}
-
-pub mod suffix_to_protein_index {
-    use clap::ValueEnum;
-    use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER};
-
-    use crate::Nullable;
-
-    /// Enum used to define the commandline arguments and choose which index style is used
-    #[derive(ValueEnum, Clone, Debug, PartialEq)]
-    pub enum SuffixToProteinMappingStyle {
-        Dense,
-        Sparse,
-    }
-
-    /// Trait implemented by the SuffixToProtein mappings
-    pub trait SuffixToProteinIndex: Send + Sync {
-        /// Returns the index of the protein in the protein list for the given suffix
-        ///
-        /// # Arguments
-        /// * `suffix` - The suffix of which we want to know of which protein it is a part
-        ///
-        /// # Returns
-        ///
-        /// Returns the index of the protein in the proteins list of which the suffix is a part
-        fn suffix_to_protein(&self, suffix: i64) -> u32;
-    }
-
-    /// Mapping that uses O(n) memory with n the size of the input text, but retrieval of the
-    /// protein is in O(1)
-    #[derive(Debug, PartialEq)]
-    pub struct DenseSuffixToProtein {
-        mapping: Vec<u32>,
-    }
-
-    /// Mapping that uses O(m) memory with m the number of proteins, but retrieval of the protein
-    /// is O(log m)
-    #[derive(Debug, PartialEq)]
-    pub struct SparseSuffixToProtein {
-        mapping: Vec<i64>,
-    }
-
-    impl SuffixToProteinIndex for DenseSuffixToProtein {
-        fn suffix_to_protein(&self, suffix: i64) -> u32 {
-            self.mapping[suffix as usize]
-        }
-    }
-
-    impl SuffixToProteinIndex for SparseSuffixToProtein {
-        fn suffix_to_protein(&self, suffix: i64) -> u32 {
-            let protein_index =
-                self.mapping.binary_search(&suffix).unwrap_or_else(|index| index - 1);
-            if self.mapping[protein_index + 1] == suffix + 1 {
-                return u32::NULL;
-            }
-            protein_index as u32
-        }
-    }
-
-    impl DenseSuffixToProtein {
-        /// Creates a new DenseSuffixToProtein mapping
-        ///
-        /// # Arguments
-        /// * `text` - The text over which we want to create the mapping
-        ///
-        /// # Returns
-        ///
-        /// Returns a new DenseSuffixToProtein built over the provided text
-        pub fn new(text: &[u8]) -> Self {
-            let mut current_protein_index: u32 = 0;
-            let mut suffix_index_to_protein: Vec<u32> = Vec::new();
-            for &char in text.iter() {
-                if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER {
-                    current_protein_index += 1;
-                    suffix_index_to_protein.push(u32::NULL);
-                } else {
-                    assert_ne!(current_protein_index, u32::NULL);
-                    suffix_index_to_protein.push(current_protein_index);
-                }
-            }
-            suffix_index_to_protein.shrink_to_fit();
-            DenseSuffixToProtein { mapping: suffix_index_to_protein }
-        }
-    }
-
-    impl SparseSuffixToProtein {
-        /// Creates a new SparseSuffixToProtein mapping
-        ///
-        /// # Arguments
-        /// * `text` - The text over which we want to create the mapping
-        ///
-        /// # Returns
-        ///
-        /// Returns a new SparseSuffixToProtein built over the provided text
-        pub fn new(text: &[u8]) -> Self {
-            let mut suffix_index_to_protein: Vec<i64> = vec![0];
-            for (index, &char) in text.iter().enumerate() {
-                if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER {
-                    suffix_index_to_protein.push(index as i64 + 1);
-                }
-            }
-            suffix_index_to_protein.shrink_to_fit();
-            SparseSuffixToProtein { mapping: suffix_index_to_protein }
-        }
-    }
-}
-
-/// Represents a suffix array.
-pub enum SuffixArray {
-    /// The original suffix array.
-    Original(Vec<i64>),
-    /// The compressed suffix array.
-    Compressed(BitArray),
-}
-
-impl SuffixArray {
-    /// Returns the length of the suffix array.
-    ///
-    /// # Returns
-    ///
-    /// The length of the suffix array.
-    pub fn len(&self) -> usize {
-        match self {
-            SuffixArray::Original(sa) => sa.len(),
-            SuffixArray::Compressed(sa) => sa.len(),
-        }
-    }
-
-    /// Returns the suffix array at the given index.
-    ///
-    /// # Arguments
-    ///
-    /// * `index` - The index of the suffix array.
-    ///
-    /// # Returns
-    ///
-    /// The suffix array at the given index.
-    pub fn get(&self, index: usize) -> i64 {
-        match self {
-            SuffixArray::Original(sa) => sa[index],
-            SuffixArray::Compressed(sa) => sa.get(index) as i64,
-        }
-    }
-
-    /// Returns whether the suffix array is empty.
-    ///
-    /// # Returns
-    ///
-    /// True if the suffix array is empty, false otherwise.
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-}
-
-/// Custom trait implemented by types that have a value that represents NULL
-pub trait Nullable<T> {
-    const NULL: T;
-
-    /// Returns whether the value is NULL.
-    ///
-    /// # Returns
-    ///
-    /// True if the value is NULL, false otherwise.
-    fn is_null(&self) -> bool;
-}
-
-/// Implementation of the `Nullable` trait for the `u32` type.
-impl Nullable<u32> for u32 {
-    const NULL: u32 = u32::MAX;
-
-    fn is_null(&self) -> bool {
-        *self == Self::NULL
-    }
-}
-
-pub(crate) use define_struct;
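
For reference, the binary layout written by the removed `dump_suffix_array` above is: one byte holding the bits per value (a literal 64 for the uncompressed dump), one byte holding the sparseness factor, eight little-endian bytes holding the number of entries, and then the values themselves. The removed `load_suffix_array` starts at the sparseness byte, which suggests the bits byte is consumed earlier by the caller. A minimal sketch of a header reader under those assumptions; `read_sa_header` is a hypothetical helper, not part of any of the crates above:

    use std::io::{BufRead, Result};

    // Hypothetical helper: parses the three header fields written by dump_suffix_array.
    fn read_sa_header(reader: &mut impl BufRead) -> Result<(u8, u8, u64)> {
        let mut bits = [0u8; 1];
        reader.read_exact(&mut bits)?; // bits per value (64 when uncompressed)
        let mut sparseness = [0u8; 1];
        reader.read_exact(&mut sparseness)?; // sample rate of the sparse suffix array
        let mut len = [0u8; 8];
        reader.read_exact(&mut len)?; // number of stored suffixes, little-endian
        Ok((bits[0], sparseness[0], u64::from_le_bytes(len)))
    }
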
From 64a0aba47ab51bf2ced9df9057b7118e5168de43 Mon Sep 17 00:00:00 2001
From: tibvdm
Date: Mon, 27 May 2024 12:01:47 +0200
Subject: [PATCH 23/26] some debug information

---
 sa-builder/src/main.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs
index a78fdce..a20702a 100644
--- a/sa-builder/src/main.rs
+++ b/sa-builder/src/main.rs
@@ -44,12 +44,18 @@ fn main() {
     let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor)
         .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
 
+    eprintln!("Suffix array constructed successfully.");
+    eprintln!("sa length: {}", sa.len());
+
     // open the output file
     let mut file =
         open_file(&output).unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
 
     if compress_sa {
         let bits_per_value = (data.len() as f64).log2().ceil() as usize;
+
+        eprintln!("Compressing suffix array with {} bits per value.", bits_per_value);
+
         if let Err(err) =
             dump_compressed_suffix_array(sa, sparseness_factor, bits_per_value, &mut file)
         {
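
A quick aside on the `bits_per_value` computation this patch logs: for a protein text of length n, every suffix index lies in [0, n), so ceil(log2(n)) bits per entry suffice. A minimal sketch of that arithmetic, assuming nothing beyond the standard library:

    fn main() {
        // 20 million characters of protein text...
        let data_len: usize = 20_000_000;
        // ...need ceil(log2(20_000_000)) = 25 bits per suffix index,
        // as opposed to the 64 bits of a plain i64 entry.
        let bits_per_value = (data_len as f64).log2().ceil() as usize;
        assert_eq!(bits_per_value, 25);
    }
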
From 7096a0d5f2a1abf6a374d2256188327c20faea24 Mon Sep 17 00:00:00 2001
From: tibvdm
Date: Tue, 28 May 2024 14:33:56 +0200
Subject: [PATCH 24/26] allow preliminary enzyme numbers (x.x.x.nx)

---
 fa-compression/src/algorithm1/decode.rs | 14 +++++++-------
 fa-compression/src/algorithm1/encode.rs | 14 +++++++-------
 fa-compression/src/algorithm1/mod.rs    | 22 ++++++++++------------
 3 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/fa-compression/src/algorithm1/decode.rs b/fa-compression/src/algorithm1/decode.rs
index 453f107..c24fc4b 100644
--- a/fa-compression/src/algorithm1/decode.rs
+++ b/fa-compression/src/algorithm1/decode.rs
@@ -82,23 +82,23 @@ mod tests {
 
     #[test]
     fn test_decode_single_ec() {
-        assert_eq!(decode(&[44, 44, 44, 189, 208]), "EC:1.1.1.-")
+        assert_eq!(decode(&[44, 44, 44, 190, 224]), "EC:1.1.1.-")
     }
 
     #[test]
     fn test_decode_single_go() {
-        assert_eq!(decode(&[209, 17, 163, 138, 208]), "GO:0009279")
+        assert_eq!(decode(&[225, 17, 163, 138, 224]), "GO:0009279")
     }
 
     #[test]
     fn test_decode_single_ipr() {
-        assert_eq!(decode(&[221, 18, 116, 117]), "IPR:IPR016364")
+        assert_eq!(decode(&[238, 18, 116, 117]), "IPR:IPR016364")
     }
 
     #[test]
     fn test_decode_no_ec() {
         assert_eq!(
-            decode(&[209, 17, 163, 138, 209, 39, 71, 94, 17, 153, 39]),
+            decode(&[225, 17, 163, 138, 225, 39, 71, 95, 17, 153, 39]),
             "GO:0009279;IPR:IPR016364;IPR:IPR008816"
         )
     }
@@ -106,7 +106,7 @@ mod tests {
     #[test]
     fn test_decode_no_go() {
         assert_eq!(
-            decode(&[44, 44, 44, 190, 44, 60, 44, 141, 209, 39, 71, 80]),
+            decode(&[44, 44, 44, 191, 44, 60, 44, 142, 225, 39, 71, 80]),
             "EC:1.1.1.-;EC:1.2.1.7;IPR:IPR016364"
         )
     }
@@ -114,7 +114,7 @@ mod tests {
     #[test]
     fn test_decode_no_ipr() {
         assert_eq!(
-            decode(&[44, 44, 44, 189, 17, 26, 56, 174, 17, 26, 56, 173]),
+            decode(&[44, 44, 44, 190, 17, 26, 56, 175, 17, 26, 56, 174]),
             "EC:1.1.1.-;GO:0009279;GO:0009279"
         )
     }
@@ -123,7 +123,7 @@ mod tests {
     fn test_decode_all() {
         assert_eq!(
             decode(&[
-                44, 44, 44, 189, 17, 26, 56, 173, 18, 116, 117, 225, 67, 116, 110, 17, 153, 39
+                44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117, 241, 67, 116, 111, 17, 153, 39
             ]),
             "EC:1.1.1.-;GO:0009279;IPR:IPR016364;IPR:IPR032635;IPR:IPR008816"
         )

diff --git a/fa-compression/src/algorithm1/encode.rs b/fa-compression/src/algorithm1/encode.rs
index 8cce9cd..e33351d 100644
--- a/fa-compression/src/algorithm1/encode.rs
+++ b/fa-compression/src/algorithm1/encode.rs
@@ -94,24 +94,24 @@ mod tests {
 
     #[test]
     fn test_encode_single_ec() {
-        assert_eq!(encode("EC:1.1.1.-"), vec![44, 44, 44, 189, 208])
+        assert_eq!(encode("EC:1.1.1.-"), vec![44, 44, 44, 190, 224])
     }
 
     #[test]
     fn test_encode_single_go() {
-        assert_eq!(encode("GO:0009279"), vec![209, 17, 163, 138, 208])
+        assert_eq!(encode("GO:0009279"), vec![225, 17, 163, 138, 224])
     }
 
     #[test]
    fn test_encode_single_ipr() {
-        assert_eq!(encode("IPR:IPR016364"), vec![221, 18, 116, 117])
+        assert_eq!(encode("IPR:IPR016364"), vec![238, 18, 116, 117])
     }
 
     #[test]
     fn test_encode_no_ec() {
         assert_eq!(
             encode("IPR:IPR016364;GO:0009279;IPR:IPR008816"),
-            vec![209, 17, 163, 138, 209, 39, 71, 94, 17, 153, 39]
+            vec![225, 17, 163, 138, 225, 39, 71, 95, 17, 153, 39]
         )
     }
 
@@ -119,7 +119,7 @@ mod tests {
     fn test_encode_no_go() {
         assert_eq!(
             encode("IPR:IPR016364;EC:1.1.1.-;EC:1.2.1.7"),
-            vec![44, 44, 44, 190, 44, 60, 44, 141, 209, 39, 71, 80]
+            vec![44, 44, 44, 191, 44, 60, 44, 142, 225, 39, 71, 80]
         )
     }
 
@@ -127,7 +127,7 @@ mod tests {
     fn test_encode_no_ipr() {
         assert_eq!(
             encode("EC:1.1.1.-;GO:0009279;GO:0009279"),
-            vec![44, 44, 44, 189, 17, 26, 56, 174, 17, 26, 56, 173]
+            vec![44, 44, 44, 190, 17, 26, 56, 175, 17, 26, 56, 174]
         )
     }
 
@@ -135,7 +135,7 @@ mod tests {
     fn test_encode_all() {
         assert_eq!(
             encode("IPR:IPR016364;EC:1.1.1.-;IPR:IPR032635;GO:0009279;IPR:IPR008816"),
-            vec![44, 44, 44, 189, 17, 26, 56, 173, 18, 116, 117, 225, 67, 116, 110, 17, 153, 39]
+            vec![44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117, 241, 67, 116, 111, 17, 153, 39]
         )
     }
 }

diff --git a/fa-compression/src/algorithm1/mod.rs b/fa-compression/src/algorithm1/mod.rs
index 3a1d9b4..5c62e7c 100644
--- a/fa-compression/src/algorithm1/mod.rs
+++ b/fa-compression/src/algorithm1/mod.rs
@@ -73,6 +73,7 @@ enum CharacterSet {
     /// Special Enzyme Commission characters
     Dash,
     Point,
+    Preliminary,
 
     /// Different annotation type separator
     Comma,
@@ -106,6 +107,7 @@ impl Encode for CharacterSet {
             b'9' => CharacterSet::Nine,
             b'-' => CharacterSet::Dash,
             b'.' => CharacterSet::Point,
+            b'n' => CharacterSet::Preliminary,
             b',' => CharacterSet::Comma,
             b';' => CharacterSet::Semicolon,
             _ => panic!("Invalid character")
@@ -138,8 +140,9 @@ impl Decode for CharacterSet {
             10 => '9',
             11 => '-',
             12 => '.',
-            13 => ',',
-            14 => ';',
+            13 => 'n',
+            14 => ',',
+            15 => ';',
             _ => panic!("Invalid character")
         }
     }
@@ -167,10 +170,10 @@ impl BitOr for CharacterSet {
 mod tests {
     use super::*;
 
-    static CHARACTERS: [u8; 15] =
-        [b'$', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'-', b'.', b',', b';'];
+    static CHARACTERS: [u8; 16] =
+        [b'$', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'-', b'.', b'n', b',', b';'];
 
-    static CHARACTER_SETS: [CharacterSet; 15] = [
+    static CHARACTER_SETS: [CharacterSet; 16] = [
         CharacterSet::Empty,
         CharacterSet::Zero,
         CharacterSet::One,
@@ -184,6 +187,7 @@ mod tests {
         CharacterSet::Nine,
         CharacterSet::Dash,
         CharacterSet::Point,
+        CharacterSet::Preliminary,
         CharacterSet::Comma,
         CharacterSet::Semicolon
     ];
@@ -233,13 +237,7 @@ mod tests {
     #[test]
     #[should_panic]
     fn test_decode_invalid() {
-        CharacterSet::decode(15);
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_decode_pair_invalid() {
-        CharacterSet::decode_pair(0b11111111);
+        CharacterSet::decode(16);
     }
 
     #[test]
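
For context on this change: preliminary EC numbers carry an `n` before the last digit (e.g. `EC:3.5.1.n3`) while the number awaits official assignment, so the character table gains a sixteenth entry. A small usage sketch; the annotation string is made up, and the exact encoded bytes are deliberately left out since only the round trip matters here:

    use fa_compression::algorithm1::{decode, encode};

    fn main() {
        // A preliminary enzyme number: 'n' marks the provisional serial digit.
        let annotation = "EC:3.5.1.n3";
        let encoded = encode(annotation);
        assert_eq!(decode(&encoded), annotation);
    }
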
From 285f246a31951ea1b4534adb95ed01a7604d5993 Mon Sep 17 00:00:00 2001
From: tibvdm
Date: Tue, 28 May 2024 14:49:52 +0200
Subject: [PATCH 25/26] encode functional annotations on load

---
 sa-mappings/src/proteins.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs
index 900c531..87178b0 100644
--- a/sa-mappings/src/proteins.rs
+++ b/sa-mappings/src/proteins.rs
@@ -10,7 +10,7 @@ use std::{
 };
 
 use bytelines::ByteLines;
-use fa_compression::algorithm1::decode;
+use fa_compression::algorithm1::{decode, encode};
 use umgap::taxon::TaxonId;
 
 use crate::taxonomy::TaxonAggregator;
@@ -84,7 +84,7 @@ impl Proteins {
             let uniprot_id = from_utf8(fields.next().unwrap())?;
             let taxon_id = from_utf8(fields.next().unwrap())?.parse::<TaxonId>()?;
             let sequence = from_utf8(fields.next().unwrap())?;
-            let functional_annotations: Vec<u8> = fields.next().unwrap().to_vec();
+            let functional_annotations: Vec<u8> = encode(from_utf8(fields.next().unwrap())?);
 
             if !taxon_aggregator.taxon_exists(taxon_id) {
                 continue;
             }
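
The practical effect of this patch is that the database TSV now carries the readable annotation strings and the index compresses them once at load time, instead of expecting pre-encoded bytes in the file. A minimal sketch of the intended round trip, assuming only the `encode`/`decode` pair from `fa-compression`:

    use fa_compression::algorithm1::{decode, encode};

    fn main() {
        // The annotations column as it now appears in the database file...
        let column = "GO:0009279;IPR:IPR016364;IPR:IPR008816";
        // ...is compressed once while the proteins are loaded...
        let stored: Vec<u8> = encode(column);
        // ...and expanded again when a search result is returned.
        assert_eq!(decode(&stored), column);
    }
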
eprint_and_exit(err.to_string().as_str())); + eprintln!("✅ Successfully loaded the proteins!"); - // calculate sparse suffix array + eprintln!(); + eprintln!("📋 Started building the suffix array..."); let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor) .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); - - eprintln!("Suffix array constructed successfully."); - eprintln!("sa length: {}", sa.len()); + eprintln!("✅ Successfully built the suffix array!"); + eprintln!("\tAmount of items: {}", sa.len()); + eprintln!("\tSample rate: {}", sparseness_factor); // open the output file let mut file = open_file(&output).unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); + eprintln!(); + eprintln!("📋 Started dumping the suffix array..."); + if compress_sa { let bits_per_value = (data.len() as f64).log2().ceil() as usize; - eprintln!("Compressing suffix array with {} bits per value.", bits_per_value); - if let Err(err) = dump_compressed_suffix_array(sa, sparseness_factor, bits_per_value, &mut file) { eprint_and_exit(err.to_string().as_str()); }; - } else if let Err(err) = dump_suffix_array(&sa, sparseness_factor, &mut file) { - eprint_and_exit(err.to_string().as_str()); + + eprintln!("✅ Successfully dumped the suffix array!"); + eprintln!("\tAmount of bits per item: {}", bits_per_value); + } else { + if let Err(err) = dump_suffix_array(&sa, sparseness_factor, &mut file) { + eprint_and_exit(err.to_string().as_str()); + } + + eprintln!("✅ Successfully dumped the suffix array!"); + eprintln!("\tAmount of bits per item: 64"); } } diff --git a/sa-compression/Cargo.toml b/sa-compression/Cargo.toml index a53939b..70a6cbf 100644 --- a/sa-compression/Cargo.toml +++ b/sa-compression/Cargo.toml @@ -7,3 +7,4 @@ edition = "2021" [dependencies] bitarray = { path = "../bitarray" } +sa-index = { path = "../sa-index" } diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs index b7bac78..85a41df 100644 --- a/sa-compression/src/lib.rs +++ b/sa-compression/src/lib.rs @@ -11,6 +11,7 @@ use bitarray::{ Binary, BitArray }; +use sa_index::SuffixArray; /// Writes the compressed suffix array to a writer. /// @@ -66,7 +67,7 @@ pub fn dump_compressed_suffix_array( pub fn load_compressed_suffix_array( reader: &mut impl BufRead, bits_per_value: usize -) -> Result<(u8, BitArray), Box> { +) -> Result> { // Read the sample rate from the binary file (1 byte) let mut sample_rate_buffer = [0_u8; 1]; reader @@ -87,7 +88,7 @@ pub fn load_compressed_suffix_array( .read_binary(reader) .map_err(|_| "Could not read the compressed suffix array from the binary file")?; - Ok((sample_rate, compressed_suffix_array)) + Ok(SuffixArray::Compressed(compressed_suffix_array, sample_rate)) } #[cfg(test)] @@ -209,12 +210,11 @@ mod tests { ]; let mut reader = std::io::BufReader::new(&data[..]); - let (sample_rate, compressed_suffix_array) = - load_compressed_suffix_array(&mut reader, 8).unwrap(); + let compressed_suffix_array = load_compressed_suffix_array(&mut reader, 8).unwrap(); - assert_eq!(sample_rate, 1); + assert_eq!(compressed_suffix_array.sample_rate(), 1); for i in 0 .. 
10 { - assert_eq!(compressed_suffix_array.get(i), i as u64 + 1); + assert_eq!(compressed_suffix_array.get(i), i as i64 + 1); } } @@ -262,7 +262,8 @@ mod tests { let mut reader = FailingReader { valid_read_count: 0 }; - assert_eq!(reader.fill_buf().unwrap(), &[]); + let right_buffer: [u8; 0] = []; + assert_eq!(reader.fill_buf().unwrap(), &right_buffer); assert_eq!(reader.consume(0), ()); let mut buffer = [0_u8; 1]; assert!(reader.read(&mut buffer).is_err()); diff --git a/sa-index/src/binary.rs b/sa-index/src/binary.rs index ae16a8f..5688d4a 100644 --- a/sa-index/src/binary.rs +++ b/sa-index/src/binary.rs @@ -7,6 +7,8 @@ use std::{ } }; +use crate::SuffixArray; + /// The `Binary` trait provides methods for reading and writing a struct as binary. pub trait Binary { /// Writes the struct as binary to the given writer. @@ -132,7 +134,7 @@ pub fn dump_suffix_array( /// # Errors /// /// Returns any error from opening the file or reading the file -pub fn load_suffix_array(reader: &mut impl BufRead) -> Result<(u8, Vec), Box> { +pub fn load_suffix_array(reader: &mut impl BufRead) -> Result> { // Read the sample rate from the binary file (1 byte) let mut sample_rate_buffer = [0_u8; 1]; reader @@ -151,7 +153,7 @@ pub fn load_suffix_array(reader: &mut impl BufRead) -> Result<(u8, Vec), Bo sa.read_binary(reader) .map_err(|_| "Could not read the suffix array from the binary file")?; - Ok((sample_rate, sa)) + Ok(SuffixArray::Original(sa, sample_rate)) } /// Fills the buffer with data read from the input. @@ -374,10 +376,12 @@ mod tests { ]; let mut reader = buffer.as_slice(); - let (sample_rate, sa) = load_suffix_array(&mut reader).unwrap(); + let sa = load_suffix_array(&mut reader).unwrap(); - assert_eq!(sample_rate, 1); - assert_eq!(sa, vec![1, 2, 3, 4, 5]); + assert_eq!(sa.sample_rate(), 1); + for i in 0 .. 5 { + assert_eq!(sa.get(i), i as i64 + 1); + } } #[test] diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs index ca13a82..f276906 100644 --- a/sa-index/src/lib.rs +++ b/sa-index/src/lib.rs @@ -8,9 +8,9 @@ pub mod suffix_to_protein_index; /// Represents a suffix array. pub enum SuffixArray { /// The original suffix array. - Original(Vec), + Original(Vec, u8), /// The compressed suffix array. - Compressed(BitArray) + Compressed(BitArray, u8) } impl SuffixArray { @@ -21,12 +21,36 @@ impl SuffixArray { /// The length of the suffix array. pub fn len(&self) -> usize { match self { - SuffixArray::Original(sa) => sa.len(), - SuffixArray::Compressed(sa) => sa.len() + SuffixArray::Original(sa, _) => sa.len(), + SuffixArray::Compressed(sa, _) => sa.len() } } - /// Returns the suffix array at the given index. + /// Returns the number of bits per value in the suffix array. + /// + /// # Returns + /// + /// The number of bits per value in the suffix array. + pub fn bits_per_value(&self) -> usize { + match self { + SuffixArray::Original(_, _) => 64, + SuffixArray::Compressed(sa, _) => sa.bits_per_value() + } + } + + /// Returns the sample rate used for the suffix array. + /// + /// # Returns + /// + /// The sample rate used for the suffix array. + pub fn sample_rate(&self) -> u8 { + match self { + SuffixArray::Original(_, sample_rate) => *sample_rate, + SuffixArray::Compressed(_, sample_rate) => *sample_rate + } + } + + /// Returns the suffix array value at the given index. /// /// # Arguments /// @@ -37,8 +61,8 @@ impl SuffixArray { /// The suffix array at the given index. 
pub fn get(&self, index: usize) -> i64 { match self { - SuffixArray::Original(sa) => sa[index], - SuffixArray::Compressed(sa) => sa.get(index) as i64 + SuffixArray::Original(sa, _) => sa[index], + SuffixArray::Compressed(sa, _) => sa.get(index) as i64 } } @@ -46,7 +70,7 @@ impl SuffixArray { /// /// # Returns /// - /// True if the suffix array is empty, false otherwise. + /// Returns `true` if the suffix array is empty, `false` otherwise. pub fn is_empty(&self) -> bool { self.len() == 0 } @@ -79,7 +103,7 @@ mod tests { #[test] fn test_suffix_array_original() { - let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5]); + let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1); assert_eq!(sa.len(), 5); assert_eq!(sa.get(0), 1); assert_eq!(sa.get(1), 2); @@ -97,7 +121,7 @@ mod tests { bitarray.set(3, 4); bitarray.set(4, 5); - let sa = SuffixArray::Compressed(bitarray); + let sa = SuffixArray::Compressed(bitarray, 1); assert_eq!(sa.len(), 5); assert_eq!(sa.get(0), 1); assert_eq!(sa.get(1), 2); @@ -106,13 +130,43 @@ mod tests { assert_eq!(sa.get(4), 5); } + #[test] + fn test_suffix_array_len() { + let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1); + assert_eq!(sa.len(), 5); + + let bitarray = BitArray::with_capacity(5, 40); + let sa = SuffixArray::Compressed(bitarray, 1); + assert_eq!(sa.len(), 5); + } + + #[test] + fn test_suffix_array_bits_per_value() { + let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1); + assert_eq!(sa.bits_per_value(), 64); + + let bitarray = BitArray::with_capacity(5, 40); + let sa = SuffixArray::Compressed(bitarray, 1); + assert_eq!(sa.bits_per_value(), 40); + } + + #[test] + fn test_suffix_array_sample_rate() { + let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1); + assert_eq!(sa.sample_rate(), 1); + + let bitarray = BitArray::with_capacity(5, 40); + let sa = SuffixArray::Compressed(bitarray, 1); + assert_eq!(sa.sample_rate(), 1); + } + #[test] fn test_suffix_array_is_empty() { - let sa = SuffixArray::Original(vec![]); + let sa = SuffixArray::Original(vec![], 1); assert_eq!(sa.is_empty(), true); let bitarray = BitArray::with_capacity(0, 0); - let sa = SuffixArray::Compressed(bitarray); + let sa = SuffixArray::Compressed(bitarray, 1); assert_eq!(sa.is_empty(), true); } diff --git a/sa-index/src/peptide_search.rs b/sa-index/src/peptide_search.rs index 623eba6..7d83914 100644 --- a/sa-index/src/peptide_search.rs +++ b/sa-index/src/peptide_search.rs @@ -70,7 +70,7 @@ pub fn search_proteins_for_peptide<'a>( let peptide = peptide.strip_suffix('\n').unwrap_or(peptide).to_uppercase(); // words that are shorter than the sample rate are not searchable - if peptide.len() < searcher.sparseness_factor as usize { + if peptide.len() < searcher.sa.sample_rate() as usize { return None; } diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index 3c82ea4..29bbc9a 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -102,7 +102,6 @@ impl PartialEq for SearchAllSuffixesResult { /// the functional analysis provided by Unipept pub struct Searcher { pub sa: SuffixArray, - pub sparseness_factor: u8, pub suffix_index_to_protein: Box, pub proteins: Proteins, pub taxon_id_calculator: TaxonAggregator, @@ -128,7 +127,6 @@ impl Searcher { /// Returns a new Searcher object pub fn new( sa: SuffixArray, - sparseness_factor: u8, suffix_index_to_protein: Box, proteins: Proteins, taxon_id_calculator: TaxonAggregator, @@ -136,7 +134,6 @@ impl Searcher { ) -> Self { Self { sa, - sparseness_factor, suffix_index_to_protein, proteins, taxon_id_calculator, @@ 
         }
 
         let mut skip: usize = 0;
-        while skip < self.sparseness_factor as usize {
+        while skip < self.sa.sample_rate() as usize {
             let mut il_locations_start = 0;
             while il_locations_start < il_locations.len()
                 && il_locations[il_locations_start] < skip {
@@ -654,16 +651,16 @@ mod tests {
     #[test]
     fn test_search_simple() {
         let proteins = get_example_proteins();
-        let sa = SuffixArray::Original(vec![
-            19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18,
-        ]);
+        let sa = SuffixArray::Original(
+            vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18],
+            1
+        );
 
         let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
         let taxonomy_file = create_taxonomy_file(&tmp_dir);
 
         let searcher = Searcher::new(
             sa,
-            1,
             Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
             proteins,
             TaxonAggregator::try_from_taxonomy_file(
@@ -690,14 +687,13 @@
     #[test]
     fn test_search_sparse() {
         let proteins = get_example_proteins();
-        let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18]);
+        let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3);
 
         let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
         let taxonomy_file = create_taxonomy_file(&tmp_dir);
 
         let searcher = Searcher::new(
             sa,
-            3,
             Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
             proteins,
             TaxonAggregator::try_from_taxonomy_file(
@@ -721,16 +717,16 @@
     #[test]
     fn test_il_equality() {
         let proteins = get_example_proteins();
-        let sa = SuffixArray::Original(vec![
-            19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18,
-        ]);
+        let sa = SuffixArray::Original(
+            vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18],
+            1
+        );
 
         let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
         let taxonomy_file = create_taxonomy_file(&tmp_dir);
 
         let searcher = Searcher::new(
             sa,
-            1,
             Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
             proteins,
             TaxonAggregator::try_from_taxonomy_file(
@@ -752,14 +748,13 @@
     #[test]
     fn test_il_equality_sparse() {
         let proteins = get_example_proteins();
-        let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18]);
+        let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3);
 
         let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
         let taxonomy_file = create_taxonomy_file(&tmp_dir);
 
         let searcher = Searcher::new(
             sa,
-            3,
             Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
             proteins,
             TaxonAggregator::try_from_taxonomy_file(
@@ -798,10 +793,9 @@
         let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
         let taxonomy_file = create_taxonomy_file(&tmp_dir);
 
-        let sparse_sa = SuffixArray::Original(vec![0, 2, 4]);
+        let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2);
         let searcher = Searcher::new(
             sparse_sa,
-            2,
             Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
             proteins,
             TaxonAggregator::try_from_taxonomy_file(
@@ -833,10 +827,9 @@
         let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
         let taxonomy_file = create_taxonomy_file(&tmp_dir);
 
-        let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2]);
+        let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1);
         let searcher = Searcher::new(
             sparse_sa,
-            1,
             Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
             proteins,
             TaxonAggregator::try_from_taxonomy_file(
@@ -867,10 +860,9 @@
         let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
         let taxonomy_file = create_taxonomy_file(&tmp_dir);
 
-        let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0]);
+        let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1);
         let searcher = Searcher::new(
             sparse_sa,
-            1,
             Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
             proteins,
             TaxonAggregator::try_from_taxonomy_file(
@@ -901,10 +893,9 @@
         let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
         let taxonomy_file = create_taxonomy_file(&tmp_dir);
 
-        let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0]);
+        let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2);
         let searcher = Searcher::new(
             sparse_sa,
-            2,
             Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
             proteins,
             TaxonAggregator::try_from_taxonomy_file(
@@ -937,10 +928,8 @@
         let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
         let taxonomy_file = create_taxonomy_file(&tmp_dir);
 
-        let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0]);
+        let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1);
         let searcher = Searcher::new(
             sparse_sa,
-            1,
             Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
             proteins,
             TaxonAggregator::try_from_taxonomy_file(
diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs
index 87178b0..e1d2f49 100644
--- a/sa-mappings/src/proteins.rs
+++ b/sa-mappings/src/proteins.rs
@@ -10,7 +10,10 @@ use std::{
 };
 
 use bytelines::ByteLines;
-use fa_compression::algorithm1::{decode, encode};
+use fa_compression::algorithm1::{
+    decode,
+    encode
+};
 use umgap::taxon::TaxonId;
 
 use crate::taxonomy::TaxonAggregator;
@@ -185,26 +188,21 @@ mod tests {
         let database_file = tmp_dir.path().join("database.tsv");
         let mut file = File::create(&database_file).unwrap();
 
-        file.write("P12345\t1\tMLPGLALLLLAAWTARALEV\t".as_bytes())
-            .unwrap();
-        file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27])
-            .unwrap();
-        file.write("\n".as_bytes()).unwrap();
-        file.write("P54321\t2\tPTDGNAGLLAEPQIAMFCGRLNMHMNVQNG\t".as_bytes())
-            .unwrap();
-        file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27])
-            .unwrap();
-        file.write("\n".as_bytes()).unwrap();
-        file.write("P67890\t6\tKWDSDPSGTKTCIDT\t".as_bytes())
-            .unwrap();
-        file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27])
-            .unwrap();
-        file.write("\n".as_bytes()).unwrap();
-        file.write("P13579\t17\tKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH\t".as_bytes())
-            .unwrap();
-        file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27])
+        file.write(
+            "P12345\t1\tMLPGLALLLLAAWTARALEV\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes()
+        )
+        .unwrap();
+        file.write(
+            "P54321\t2\tPTDGNAGLLAEPQIAMFCGRLNMHMNVQNG\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n"
+                .as_bytes()
+        )
+        .unwrap();
+        file.write(
+            "P67890\t6\tKWDSDPSGTKTCIDT\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes()
+        )
+        .unwrap();
+        file.write("P13579\t17\tKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes())
             .unwrap();
-        file.write("\n".as_bytes()).unwrap();
 
         database_file
     }
diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs
index 7fb3322..ef774b2 100644
--- a/sa-server/src/main.rs
+++ b/sa-server/src/main.rs
@@ -175,23 +175,34 @@ async fn start_server(args: Arguments) -> Result<(), Box<dyn Error>> {
         taxonomy
     } = args;
 
-    eprintln!("Loading suffix array...");
-    let (sample_rate, sa) = load_suffix_array_file(&index_file)?;
-
-    eprintln!("Loading taxon file...");
+    eprintln!();
+    eprintln!("📋 Started loading the suffix array...");
+    let sa = load_suffix_array_file(&index_file)?;
+    eprintln!("✅ Successfully loaded the suffix array!");
+    eprintln!("\tNumber of items: {}", sa.len());
+    eprintln!("\tNumber of bits per item: {}", sa.bits_per_value());
+    eprintln!("\tSample rate: {}", sa.sample_rate());
+
+    eprintln!();
+    eprintln!("📋 Started loading the taxon file...");
     let taxon_id_calculator =
         TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar)?;
+    eprintln!("✅ Successfully loaded the taxon file!");
+    eprintln!("\tAggregation method: LCA*");
 
+    eprintln!();
+    eprintln!("📋 Started creating the function aggregator...");
     let function_aggregator = FunctionAggregator {};
+    eprintln!("✅ Successfully created the function aggregator!");
 
-    eprintln!("Loading proteins...");
+    eprintln!();
+    eprintln!("📋 Started loading the proteins...");
     let proteins = Proteins::try_from_database_file(&database_file, &taxon_id_calculator)?;
     let suffix_index_to_protein = Box::new(SparseSuffixToProtein::new(&proteins.input_string));
+    eprintln!("✅ Successfully loaded the proteins!");
 
-    eprintln!("Creating searcher...");
     let searcher = Arc::new(Searcher::new(
         sa,
-        sample_rate,
         suffix_index_to_protein,
         proteins,
         taxon_id_calculator,
@@ -212,13 +223,15 @@ async fn start_server(args: Arguments) -> Result<(), Box<dyn Error>> {
         .with_state(searcher);
 
     let listener = tokio::net::TcpListener::bind("0.0.0.0:3000").await?;
-    println!("server is ready...");
+
+    eprintln!();
+    eprintln!("🚀 Server is ready...");
     axum::serve(listener, app).await?;
 
     Ok(())
 }
 
-fn load_suffix_array_file(file: &str) -> Result<(u8, SuffixArray), Box<dyn Error>> {
+fn load_suffix_array_file(file: &str) -> Result<SuffixArray, Box<dyn Error>> {
     // Open the suffix array file
     let mut sa_file = File::open(file)?;
 
@@ -233,10 +246,8 @@ fn load_suffix_array_file(file: &str) -> Result<(u8, SuffixArray), Box