diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index bf98de4..f015f40 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -19,6 +19,33 @@ jobs: toolchain: nightly override: true + + + - name: Run cargo test (bitarray) + uses: actions-rs/cargo@v1 + with: + command: test + args: --all-features --no-fail-fast -p bitarray + env: + CARGO_INCREMENTAL: 0 + RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + + - name: Gather coverage information (bitarray) + id: coverage-bitarray + uses: actions-rs/grcov@v0.1 + + - name: Upload coverage reports to Codecov (bitarray) + uses: codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ${{ steps.coverage-bitarray.outputs.report }} + flags: bitarray + verbose: true + fail_ci_if_error: true + + + - name: Run cargo test (fa-compression) uses: actions-rs/cargo@v1 with: @@ -42,6 +69,82 @@ jobs: verbose: true fail_ci_if_error: true + + + - name: Run cargo test (sa-builder) + uses: actions-rs/cargo@v1 + with: + command: test + args: --all-features --no-fail-fast -p sa-builder + env: + CARGO_INCREMENTAL: 0 + RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + + - name: Gather coverage information (sa-builder) + id: coverage-sa-builder + uses: actions-rs/grcov@v0.1 + + - name: Upload coverage reports to Codecov (sa-builder) + uses: codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ${{ steps.coverage-sa-builder.outputs.report }} + flags: sa-builder + verbose: true + fail_ci_if_error: true + + + + - name: Run cargo test (sa-compression) + uses: actions-rs/cargo@v1 + with: + command: test + args: --all-features --no-fail-fast -p sa-compression + env: + CARGO_INCREMENTAL: 0 + RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + + - name: Gather coverage information (sa-compression) + id: coverage-sa-compression + uses: actions-rs/grcov@v0.1 + + - name: Upload coverage reports to Codecov (sa-compression) + uses: codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ${{ steps.coverage-sa-compression.outputs.report }} + flags: sa-compression + verbose: true + fail_ci_if_error: true + + + + - name: Run cargo test (sa-index) + uses: actions-rs/cargo@v1 + with: + command: test + args: --all-features --no-fail-fast -p sa-index + env: + CARGO_INCREMENTAL: 0 + RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + + - name: Gather coverage information (sa-index) + id: coverage-sa-index + uses: actions-rs/grcov@v0.1 + + - name: Upload coverage reports to Codecov (sa-index) + uses: codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ${{ steps.coverage-sa-index.outputs.report }} + flags: sa-index + verbose: true + fail_ci_if_error: true + + - name: Run cargo test (sa-mappings) uses: actions-rs/cargo@v1 with: @@ -63,4 +166,4 @@ jobs: file: ${{ steps.coverage-sa-mappings.outputs.report }} flags: sa-mappings verbose: true - fail_ci_if_error: true + fail_ci_if_error: true \ No newline at end of file diff --git a/.gitignore b/.gitignore index 2f7896d..f534053 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ target/ +data/ + +.DS_Store diff --git a/Cargo.lock b/Cargo.lock index d5ed545..900c218 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -240,6 +240,10 @@ dependencies = [ "which", ] +[[package]] +name = "bitarray" +version = "0.1.0" + [[package]] name = "bitflags" version = "1.3.2" @@ -1408,16 +1412,27 @@ dependencies = [ "clap 4.5.4", "libdivsufsort-rs", "libsais64-rs", + "sa-compression", + "sa-index", "sa-mappings", ] +[[package]] +name = "sa-compression" +version = "0.1.0" +dependencies = [ + "bitarray", + "sa-index", +] + [[package]] name = "sa-index" version = "0.1.0" dependencies = [ + "bitarray", "clap 4.5.4", + "fa-compression", "rayon", - "sa-builder", "sa-mappings", "serde", "serde_json", @@ -1444,6 +1459,7 @@ dependencies = [ "axum", "clap 4.5.4", "sa-builder", + "sa-compression", "sa-index", "sa-mappings", "serde", diff --git a/Cargo.toml b/Cargo.toml index 728b06e..fec447c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,10 @@ [workspace] resolver = "2" -members = [ +members = [ "bitarray", "fa-compression", "libsais64-rs", - "sa-builder", + "sa-builder", "sa-compression", "sa-index", "sa-mappings", "sa-server" diff --git a/README.md b/README.md index 9a4607c..15f8897 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,20 @@ ![Codecov](https://img.shields.io/codecov/c/github/unipept/unipept-index?token=IZ75A2FY98&logo=Codecov) -The unipept index written entirely in `Rust`. This repository consists of multiple different Rust projects that depend on -each other. More information about each project can be found in their respective `README.md` file. +The unipept index written entirely in `Rust`. This repository consists of multiple different Rust projects that depend on each other. More information about each project can be found in their respective `README.md` file. ## Installation -Clone this repository with the following command: +> [!NOTE] +> To build and use the Unipept Index, you need to have Rust installed. If you don't have Rust installed, you can get it from [rust-lang.org](https://www.rust-lang.org/). + +Clone this repository by executing the following command: ```bash git clone https://github.com/unipept/unipept-index.git ``` -And build the projects using: +Next, build everything by executing the following command in the root of the repository. ```bash cargo build --release diff --git a/bitarray/Cargo.toml b/bitarray/Cargo.toml new file mode 100644 index 0000000..8176d57 --- /dev/null +++ b/bitarray/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "bitarray" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/bitarray/README.md b/bitarray/README.md new file mode 100644 index 0000000..074f7c5 --- /dev/null +++ b/bitarray/README.md @@ -0,0 +1,21 @@ +# Bitarray + +![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/unipept/unipept-index/test.yml?logo=github) +![Codecov](https://img.shields.io/codecov/c/github/unipept/unipept-index?token=IZ75A2FY98&flag=bitarray&logo=codecov) +![Static Badge](https://img.shields.io/badge/doc-rustdoc-blue) + +The `bitarray` offers a special array where each item is represented by a specified amount of bits (smaller than 64). The bitarray uses a pre-alocated vector and allows you to `set` or `get` a value from the array. + +## Example + +```rust +use bitarray; + +fn main() { + let bitarray = BitArray::<40>::with_capacity(4); + + bitarray.set(0, 0b0001110011111010110001000111111100110010); + + assert_eq!(bitarray.get(0), 0b0001110011111010110001000111111100110010); +} +``` diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs new file mode 100644 index 0000000..2609c3b --- /dev/null +++ b/bitarray/src/binary.rs @@ -0,0 +1,203 @@ +//! This module provides utilities for reading and writing the bitarray as binary. + +use std::io::{ + BufRead, + Read, + Result, + Write +}; + +use crate::BitArray; + +/// The `Binary` trait provides methods for reading and writing a struct as binary. +pub trait Binary { + /// Writes the struct as binary to the given writer. + /// + /// # Arguments + /// + /// * `writer` - The writer to write the binary data to. + /// + /// # Returns + /// + /// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs. + fn write_binary(&self, writer: &mut W) -> Result<()>; + + /// Reads binary data into a struct from the given reader. + /// + /// # Arguments + /// + /// * `reader` - The reader to read the binary data from. + /// + /// # Returns + /// + /// Returns `Ok(())` if the read operation is successful, or an `Err` if an error occurs. + fn read_binary(&mut self, reader: R) -> Result<()>; +} + +/// Implementation of the `Binary` trait for the `BitArray` struct. +impl Binary for BitArray { + /// Writes the binary representation of the `BitArray` to the given writer. + /// + /// # Arguments + /// + /// * `writer` - The writer to which the binary data will be written. + /// + /// # Errors + /// + /// Returns an error if there was a problem writing to the writer. + fn write_binary(&self, writer: &mut W) -> Result<()> { + for value in self.data.iter() { + writer.write_all(&value.to_le_bytes())?; + } + + Ok(()) + } + + /// Reads the binary representation of the `BitArray` from the given reader. + /// + /// # Arguments + /// + /// * `reader` - The reader from which the binary data will be read. + /// + /// # Errors + /// + /// Returns an error if there was a problem reading from the reader. + fn read_binary(&mut self, mut reader: R) -> Result<()> { + self.data.clear(); + + let mut buffer = vec![0; 8 * 1024]; + + loop { + let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer)?; + for buffer_slice in buffer[.. bytes_read].chunks_exact(8) { + self.data + .push(u64::from_le_bytes(buffer_slice.try_into().unwrap())); + } + + if finished { + break; + } + } + + Ok(()) + } +} + +/// Fills the buffer with data read from the input. +/// +/// # Arguments +/// +/// * `input` - The input source to read data from. +/// * `buffer` - The buffer to fill with data. +/// +/// # Returns +/// +/// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input +/// is reached, and `bytes_read` is the number of bytes read into the buffer. +fn fill_buffer(input: &mut T, buffer: &mut Vec) -> std::io::Result<(bool, usize)> { + // Store the buffer size in advance, because rust will complain + // about the buffer being borrowed mutably while it's borrowed + let buffer_size = buffer.len(); + + let mut writable_buffer_space = buffer.as_mut(); + + loop { + match input.read(writable_buffer_space) { + // No bytes written, which means we've completely filled the buffer + // or we've reached the end of the file + Ok(0) => { + return Ok(( + !writable_buffer_space.is_empty(), + buffer_size - writable_buffer_space.len() + )); + } + + // We've read {bytes_read} bytes + Ok(bytes_read) => { + // Shrink the writable buffer slice + writable_buffer_space = writable_buffer_space[bytes_read ..].as_mut(); + } + + // An error occurred while reading + Err(e) => { + return Err(e); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + pub struct ErrorInput; + + impl Read for ErrorInput { + fn read(&mut self, _buf: &mut [u8]) -> std::io::Result { + Err(std::io::Error::new(std::io::ErrorKind::Other, "read error")) + } + } + + #[test] + fn test_fill_buffer() { + let input_str = "a".repeat(8_000); + let mut input = input_str.as_bytes(); + + let mut buffer = vec![0; 800]; + + loop { + let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer).unwrap(); + + if finished { + assert!(bytes_read < 800); + break; + } else { + assert_eq!(bytes_read, 800); + } + } + } + + #[test] + fn test_fill_buffer_read_error() { + let mut input = ErrorInput; + let mut buffer = vec![0; 800]; + + assert!(fill_buffer(&mut input, &mut buffer).is_err()); + } + + #[test] + fn test_write_binary() { + let mut bitarray = BitArray::with_capacity(4, 40); + bitarray.set(0, 0x1234567890); + bitarray.set(1, 0xabcdef0123); + bitarray.set(2, 0x4567890abc); + bitarray.set(3, 0xdef0123456); + + let mut buffer = Vec::new(); + bitarray.write_binary(&mut buffer).unwrap(); + + assert_eq!( + buffer, + vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, + 0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ] + ); + } + + #[test] + fn test_read_binary() { + let buffer = vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, + 0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0, + ]; + + let mut bitarray = BitArray::with_capacity(4, 40); + bitarray.read_binary(&buffer[..]).unwrap(); + + assert_eq!(bitarray.get(0), 0x1234567890); + assert_eq!(bitarray.get(1), 0xabcdef0123); + assert_eq!(bitarray.get(2), 0x4567890abc); + assert_eq!(bitarray.get(3), 0xdef0123456); + } +} diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs new file mode 100644 index 0000000..0a5f647 --- /dev/null +++ b/bitarray/src/lib.rs @@ -0,0 +1,410 @@ +//! This module contains the `BitArray` struct and its associated methods. + +mod binary; + +use std::{ + cmp::max, + io::{ + Result, + Write + } +}; + +/// Re-export the `Binary` trait. +pub use binary::Binary; + +/// A fixed-size bit array implementation. +pub struct BitArray { + /// The underlying data storage for the bit array. + data: Vec, + /// The mask used to extract the relevant bits from each element in the data vector. + mask: u64, + /// The length of the bit array. + len: usize, + /// The number of bits in a single element of the data vector. + bits_per_value: usize +} + +impl BitArray { + /// Creates a new `BitArray` with the specified capacity. + /// + /// # Arguments + /// + /// * `capacity` - The number of bits the `BitArray` can hold. + /// * `bits_per_value` - The number of bits in a single value. + /// + /// # Returns + /// + /// A new `BitArray` with the specified capacity. + pub fn with_capacity(capacity: usize, bits_per_value: usize) -> Self { + let extra = if capacity * bits_per_value % 64 == 0 { + 0 + } else { + 1 + }; + Self { + data: vec![0; capacity * bits_per_value / 64 + extra], + mask: (1 << bits_per_value) - 1, + len: capacity, + bits_per_value + } + } + + /// Retrieves the value at the specified index in the `BitArray`. + /// + /// # Arguments + /// + /// * `index` - The index of the value to retrieve. + /// + /// # Returns + /// + /// The value at the specified index. + pub fn get(&self, index: usize) -> u64 { + let start_block = index * self.bits_per_value / 64; + let start_block_offset = index * self.bits_per_value % 64; + + // If the value is contained within a single block + if start_block_offset + self.bits_per_value <= 64 { + // Shift the value to the right so that the relevant bits are in the least significant + // position Then mask out the irrelevant bits + return self.data[start_block] >> (64 - start_block_offset - self.bits_per_value) + & self.mask; + } + + let end_block = (index + 1) * self.bits_per_value / 64; + let end_block_offset = (index + 1) * self.bits_per_value % 64; + + // Extract the relevant bits from the start block and shift them {end_block_offset} bits to + // the left + let a = self.data[start_block] << end_block_offset; + + // Extract the relevant bits from the end block and shift them to the least significant + // position + let b = self.data[end_block] >> (64 - end_block_offset); + + // Paste the two values together and mask out the irrelevant bits + (a | b) & self.mask + } + + /// Sets the value at the specified index in the `BitArray`. + /// + /// # Arguments + /// + /// * `index` - The index of the value to set. + /// * `value` - The value to set at the specified index. + pub fn set(&mut self, index: usize, value: u64) { + let start_block = index * self.bits_per_value / 64; + let start_block_offset = index * self.bits_per_value % 64; + + // If the value is contained within a single block + if start_block_offset + self.bits_per_value <= 64 { + // Clear the relevant bits in the start block + self.data[start_block] &= + !(self.mask << (64 - start_block_offset - self.bits_per_value)); + // Set the relevant bits in the start block + self.data[start_block] |= value << (64 - start_block_offset - self.bits_per_value); + return; + } + + let end_block = (index + 1) * self.bits_per_value / 64; + let end_block_offset = (index + 1) * self.bits_per_value % 64; + + // Clear the relevant bits in the start block + self.data[start_block] &= !(self.mask >> start_block_offset); + // Set the relevant bits in the start block + self.data[start_block] |= value >> end_block_offset; + + // Clear the relevant bits in the end block + self.data[end_block] &= !(self.mask << (64 - end_block_offset)); + // Set the relevant bits in the end block + self.data[end_block] |= value << (64 - end_block_offset); + } + + /// Returns the number of bits in a single value. + /// + /// # Returns + /// + /// The number of bits in a single value. + pub fn bits_per_value(&self) -> usize { + self.bits_per_value + } + + /// Returns the length of the `BitArray`. + /// + /// # Returns + /// + /// The length of the `BitArray`. + pub fn len(&self) -> usize { + self.len + } + + /// Checks if the `BitArray` is empty. + /// + /// # Returns + /// + /// `true` if the `BitArray` is empty, `false` otherwise. + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Clears the `BitArray`, setting all bits to 0. + pub fn clear(&mut self) { + self.data.iter_mut().for_each(|x| *x = 0); + } +} + +/// Writes the data to a writer in a binary format using a bit array. This function is helpfull +/// when writing large amounts of data to a writer in chunks. The data is written in chunks of the +/// specified capacity, so memory usage is minimized. +/// +/// # Arguments +/// +/// * `data` - The data to write. +/// * `bits_per_value` - The number of bits in a single value. +/// * `max_capacity` - The maximum amount of elements that may be stored in the bit array. +/// * `writer` - The writer to write the data to. +/// +/// # Returns +/// +/// A `Result` indicating whether the write operation was successful or not. +pub fn data_to_writer( + data: Vec, + bits_per_value: usize, + max_capacity: usize, + writer: &mut impl Write +) -> Result<()> { + // Update the max capacity to be a multiple of the greatest common divisor of the bits per value + // and 64. This is done to ensure that the bit array can store the data entirely + let greates_common_divisor = gcd(bits_per_value, 64); + let capacity = + max(greates_common_divisor, max_capacity / greates_common_divisor * greates_common_divisor); + + // If amount of data is less than the max capacity, write the data to the writer in a single + // chunk + if data.len() <= capacity { + let mut bitarray = BitArray::with_capacity(data.len(), bits_per_value); + + for (i, &value) in data.iter().enumerate() { + bitarray.set(i, value as u64); + } + bitarray.write_binary(writer)?; + + return Ok(()); + } + + // Create a bit array that can store a single chunk of data + let mut bitarray = BitArray::with_capacity(capacity, bits_per_value); + + // Write the data to the writer in chunks of the specified capacity + let chunks = data.chunks_exact(capacity); + + // Store the remainder before looping over the chunks + let remainder = chunks.remainder(); + + for chunk in chunks { + for (i, &value) in chunk.iter().enumerate() { + bitarray.set(i, value as u64); + } + bitarray.write_binary(writer)?; + bitarray.clear(); + } + + // Create a new bit array with the remainder capacity + bitarray = BitArray::with_capacity(remainder.len(), bits_per_value); + + for (i, &value) in remainder.iter().enumerate() { + bitarray.set(i, value as u64); + } + bitarray.write_binary(writer)?; + + Ok(()) +} + +/// Calculates the greatest common divisor of two numbers. +/// +/// # Arguments +/// +/// * `a` - The first number. +/// * `b` - The second number. +/// +/// # Returns +/// +/// The greatest common divisor of the two numbers. +fn gcd(mut a: usize, mut b: usize) -> usize { + while b != 0 { + if b < a { + std::mem::swap(&mut b, &mut a); + } + b %= a; + } + a +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bitarray_with_capacity() { + let bitarray = BitArray::with_capacity(4, 40); + assert_eq!(bitarray.data, vec![0, 0, 0]); + assert_eq!(bitarray.mask, 0xff_ffff_ffff); + assert_eq!(bitarray.len, 4); + } + + #[test] + fn test_bitarray_get() { + let mut bitarray = BitArray::with_capacity(4, 40); + bitarray.data = vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144eb9ca32eb4a4]; + + assert_eq!(bitarray.get(0), 0b0001110011111010110001000111111100110010); + assert_eq!(bitarray.get(1), 0b1100001001010010011000010100110111001001); + assert_eq!(bitarray.get(2), 0b1111001101001101101101101011101001010001); + assert_eq!(bitarray.get(3), 0b0000100010010001010001001110101110011100); + } + + #[test] + fn test_bitarray_set() { + let mut bitarray = BitArray::with_capacity(4, 40); + + bitarray.set(0, 0b0001110011111010110001000111111100110010); + bitarray.set(1, 0b1100001001010010011000010100110111001001); + bitarray.set(2, 0b1111001101001101101101101011101001010001); + bitarray.set(3, 0b0000100010010001010001001110101110011100); + + assert_eq!(bitarray.data, vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000]); + } + + #[test] + fn test_bitarray_bits_per_value() { + let bitarray = BitArray::with_capacity(4, 40); + assert_eq!(bitarray.bits_per_value(), 40); + } + + #[test] + fn test_bitarray_len() { + let bitarray = BitArray::with_capacity(4, 40); + assert_eq!(bitarray.len(), 4); + } + + #[test] + fn test_bitarray_is_empty() { + let bitarray = BitArray::with_capacity(0, 40); + assert!(bitarray.is_empty()); + } + + #[test] + fn test_bitarray_is_not_empty() { + let bitarray = BitArray::with_capacity(4, 40); + assert!(!bitarray.is_empty()); + } + + #[test] + fn test_bitarray_clear() { + let mut bitarray = BitArray::with_capacity(4, 40); + bitarray.data = vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144eb9ca32eb4a4]; + + bitarray.clear(); + + assert_eq!(bitarray.data, vec![0, 0, 0]); + } + + #[test] + fn test_data_to_writer_no_chunks_needed() { + let data = vec![0x1234567890, 0xabcdef0123, 0x4567890abc, 0xdef0123456]; + let mut writer = Vec::new(); + + data_to_writer(data, 40, 2, &mut writer).unwrap(); + + assert_eq!( + writer, + vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, + 0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ] + ); + } + + #[test] + fn test_data_to_writer_chunks_needed_no_remainder() { + let data = vec![ + 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777, + 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, + 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, + 0x66666666, 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, + 0xdddddddd, 0xeeeeeeee, 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, + 0x44444444, 0x55555555, 0x66666666, 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, + 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, 0xffffffff, 0x00000000, 0x11111111, + 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777, 0x88888888, + 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, 0xffffffff, + 0x00000000, + ]; + let mut writer = Vec::new(); + + data_to_writer(data, 32, 8, &mut writer).unwrap(); + + assert_eq!( + writer, + vec![ + 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, + 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, + 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, + 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, + 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, + 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, + 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, + 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, + 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, + 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, + 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, + 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, + 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, + 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, + 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, + 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, + 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff + ] + ); + } + + #[test] + fn test_data_to_writer_chunks_needed_plus_remainder() { + let data = vec![ + 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777, + 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, + 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, + 0x66666666, 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, + 0xdddddddd, 0xeeeeeeee, 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, + ]; + let mut writer = Vec::new(); + + data_to_writer(data, 32, 8, &mut writer).unwrap(); + + assert_eq!( + writer, + vec![ + 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, + 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, + 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, + 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, + 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, + 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, + 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, + 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, + 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x00, + 0x33, 0x33, 0x33, 0x33 + ] + ); + } + + #[test] + fn test_gcd() { + assert_eq!(gcd(40, 64), 8); + assert_eq!(gcd(64, 40), 8); + assert_eq!(gcd(64, 64), 64); + assert_eq!(gcd(32, 64), 32); + } +} diff --git a/codecov.yml b/codecov.yml index 030fcbe..0a92f62 100644 --- a/codecov.yml +++ b/codecov.yml @@ -3,32 +3,52 @@ coverage: project: default: target: 90% + bitarray: + flags: + - bitarray fa-compression: - target: 90% flags: - fa-compression - sa-mappings: - target: 90% + sa-builder: flags: - - sa-mappings - patch: - default: - target: 90% - fa-compression: - target: 90% + - sa-builder + sa-compression: flags: - - fa-compression + - sa-compression + sa-index: + target: 80% + flags: + - sa-index sa-mappings: - target: 90% flags: - sa-mappings + patch: off flags: + bitarray: + paths: + - bitarray + carryforward: true fa-compression: paths: - fa-compression carryforward: true + sa-builder: + paths: + - sa-builder + carryforward: true + sa-compression: + paths: + - sa-compression + carryforward: true + sa-index: + paths: + - sa-index + carryforward: true sa-mappings: paths: - sa-mappings carryforward: true + +ignore: + - "**/main.rs" # Ignore main.rs files diff --git a/fa-compression/src/algorithm1/decode.rs b/fa-compression/src/algorithm1/decode.rs index 453f107..5295a58 100644 --- a/fa-compression/src/algorithm1/decode.rs +++ b/fa-compression/src/algorithm1/decode.rs @@ -28,9 +28,9 @@ static PREFIXES: [&str; 3] = ["EC:", "GO:", "IPR:IPR"]; /// ``` /// use fa_compression::algorithm1::decode; /// -/// let input = &[ 44, 44, 44, 189, 17, 26, 56, 173, 18, 116, 117, 225, 67, 116, 110, 17, 153, 39 ]; +/// let input = &[ 44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117 ]; /// let result = decode(input); -/// assert_eq!(result, "EC:1.1.1.-;GO:0009279;IPR:IPR016364;IPR:IPR032635;IPR:IPR008816"); +/// assert_eq!(result, "EC:1.1.1.-;GO:0009279;IPR:IPR016364"); /// ``` pub fn decode(input: &[u8]) -> String { if input.is_empty() { @@ -82,23 +82,23 @@ mod tests { #[test] fn test_decode_single_ec() { - assert_eq!(decode(&[44, 44, 44, 189, 208]), "EC:1.1.1.-") + assert_eq!(decode(&[44, 44, 44, 190, 224]), "EC:1.1.1.-") } #[test] fn test_decode_single_go() { - assert_eq!(decode(&[209, 17, 163, 138, 208]), "GO:0009279") + assert_eq!(decode(&[225, 17, 163, 138, 224]), "GO:0009279") } #[test] fn test_decode_single_ipr() { - assert_eq!(decode(&[221, 18, 116, 117]), "IPR:IPR016364") + assert_eq!(decode(&[238, 18, 116, 117]), "IPR:IPR016364") } #[test] fn test_decode_no_ec() { assert_eq!( - decode(&[209, 17, 163, 138, 209, 39, 71, 94, 17, 153, 39]), + decode(&[225, 17, 163, 138, 225, 39, 71, 95, 17, 153, 39]), "GO:0009279;IPR:IPR016364;IPR:IPR008816" ) } @@ -106,7 +106,7 @@ mod tests { #[test] fn test_decode_no_go() { assert_eq!( - decode(&[44, 44, 44, 190, 44, 60, 44, 141, 209, 39, 71, 80]), + decode(&[44, 44, 44, 191, 44, 60, 44, 142, 225, 39, 71, 80]), "EC:1.1.1.-;EC:1.2.1.7;IPR:IPR016364" ) } @@ -114,7 +114,7 @@ mod tests { #[test] fn test_decode_no_ipr() { assert_eq!( - decode(&[44, 44, 44, 189, 17, 26, 56, 174, 17, 26, 56, 173]), + decode(&[44, 44, 44, 190, 17, 26, 56, 175, 17, 26, 56, 174]), "EC:1.1.1.-;GO:0009279;GO:0009279" ) } @@ -123,7 +123,7 @@ mod tests { fn test_decode_all() { assert_eq!( decode(&[ - 44, 44, 44, 189, 17, 26, 56, 173, 18, 116, 117, 225, 67, 116, 110, 17, 153, 39 + 44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117, 241, 67, 116, 111, 17, 153, 39 ]), "EC:1.1.1.-;GO:0009279;IPR:IPR016364;IPR:IPR032635;IPR:IPR008816" ) diff --git a/fa-compression/src/algorithm1/encode.rs b/fa-compression/src/algorithm1/encode.rs index 8cce9cd..0877c9a 100644 --- a/fa-compression/src/algorithm1/encode.rs +++ b/fa-compression/src/algorithm1/encode.rs @@ -28,7 +28,7 @@ use super::{ /// let input = "IPR:IPR016364;EC:1.1.1.-;GO:0009279"; /// let encoded = encode(input); /// -/// assert_eq!(encoded, vec![ 44, 44, 44, 189, 17, 26, 56, 173, 18, 116, 117 ]); +/// assert_eq!(encoded, vec![ 44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117 ]); /// ``` pub fn encode(input: &str) -> Vec { if input.is_empty() { @@ -94,24 +94,24 @@ mod tests { #[test] fn test_encode_single_ec() { - assert_eq!(encode("EC:1.1.1.-"), vec![44, 44, 44, 189, 208]) + assert_eq!(encode("EC:1.1.1.-"), vec![44, 44, 44, 190, 224]) } #[test] fn test_encode_single_go() { - assert_eq!(encode("GO:0009279"), vec![209, 17, 163, 138, 208]) + assert_eq!(encode("GO:0009279"), vec![225, 17, 163, 138, 224]) } #[test] fn test_encode_single_ipr() { - assert_eq!(encode("IPR:IPR016364"), vec![221, 18, 116, 117]) + assert_eq!(encode("IPR:IPR016364"), vec![238, 18, 116, 117]) } #[test] fn test_encode_no_ec() { assert_eq!( encode("IPR:IPR016364;GO:0009279;IPR:IPR008816"), - vec![209, 17, 163, 138, 209, 39, 71, 94, 17, 153, 39] + vec![225, 17, 163, 138, 225, 39, 71, 95, 17, 153, 39] ) } @@ -119,7 +119,7 @@ mod tests { fn test_encode_no_go() { assert_eq!( encode("IPR:IPR016364;EC:1.1.1.-;EC:1.2.1.7"), - vec![44, 44, 44, 190, 44, 60, 44, 141, 209, 39, 71, 80] + vec![44, 44, 44, 191, 44, 60, 44, 142, 225, 39, 71, 80] ) } @@ -127,7 +127,7 @@ mod tests { fn test_encode_no_ipr() { assert_eq!( encode("EC:1.1.1.-;GO:0009279;GO:0009279"), - vec![44, 44, 44, 189, 17, 26, 56, 174, 17, 26, 56, 173] + vec![44, 44, 44, 190, 17, 26, 56, 175, 17, 26, 56, 174] ) } @@ -135,7 +135,7 @@ mod tests { fn test_encode_all() { assert_eq!( encode("IPR:IPR016364;EC:1.1.1.-;IPR:IPR032635;GO:0009279;IPR:IPR008816"), - vec![44, 44, 44, 189, 17, 26, 56, 173, 18, 116, 117, 225, 67, 116, 110, 17, 153, 39] + vec![44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117, 241, 67, 116, 111, 17, 153, 39] ) } } diff --git a/fa-compression/src/algorithm1/mod.rs b/fa-compression/src/algorithm1/mod.rs index 3a1d9b4..a495c9e 100644 --- a/fa-compression/src/algorithm1/mod.rs +++ b/fa-compression/src/algorithm1/mod.rs @@ -73,6 +73,7 @@ enum CharacterSet { /// Special Enzyme Commission characters Dash, Point, + Preliminary, /// Different annotation type separator Comma, @@ -106,6 +107,7 @@ impl Encode for CharacterSet { b'9' => CharacterSet::Nine, b'-' => CharacterSet::Dash, b'.' => CharacterSet::Point, + b'n' => CharacterSet::Preliminary, b',' => CharacterSet::Comma, b';' => CharacterSet::Semicolon, _ => panic!("Invalid character") @@ -138,8 +140,9 @@ impl Decode for CharacterSet { 10 => '9', 11 => '-', 12 => '.', - 13 => ',', - 14 => ';', + 13 => 'n', + 14 => ',', + 15 => ';', _ => panic!("Invalid character") } } @@ -167,10 +170,12 @@ impl BitOr for CharacterSet { mod tests { use super::*; - static CHARACTERS: [u8; 15] = - [b'$', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'-', b'.', b',', b';']; + static CHARACTERS: [u8; 16] = [ + b'$', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'-', b'.', b'n', b',', + b';' + ]; - static CHARACTER_SETS: [CharacterSet; 15] = [ + static CHARACTER_SETS: [CharacterSet; 16] = [ CharacterSet::Empty, CharacterSet::Zero, CharacterSet::One, @@ -184,6 +189,7 @@ mod tests { CharacterSet::Nine, CharacterSet::Dash, CharacterSet::Point, + CharacterSet::Preliminary, CharacterSet::Comma, CharacterSet::Semicolon ]; @@ -233,13 +239,7 @@ mod tests { #[test] #[should_panic] fn test_decode_invalid() { - CharacterSet::decode(15); - } - - #[test] - #[should_panic] - fn test_decode_pair_invalid() { - CharacterSet::decode_pair(0b11111111); + CharacterSet::decode(16); } #[test] diff --git a/sa-builder/Cargo.toml b/sa-builder/Cargo.toml index 9937384..cb7d553 100644 --- a/sa-builder/Cargo.toml +++ b/sa-builder/Cargo.toml @@ -10,3 +10,5 @@ clap = { version = "4.4.8", features = ["derive"] } libsais64-rs = { path = "../libsais64-rs" } libdivsufsort-rs = "0.1.0" sa-mappings = { path = "../sa-mappings" } +sa-compression = { path = "../sa-compression" } +sa-index = { path = "../sa-index" } diff --git a/sa-builder/README.md b/sa-builder/README.md new file mode 100644 index 0000000..5f51b07 --- /dev/null +++ b/sa-builder/README.md @@ -0,0 +1,28 @@ +Suffix Array Builder +==================== + +A rust implementation to build large generalized suffix arrays. + +# Usage + +```plain +Build a (sparse, compressed) suffix array from the given text + +Usage: sa-builder [OPTIONS] --database-file --taxonomy --output + +Options: + -d, --database-file + File with the proteins used to build the suffix tree. All the proteins are expected to be concatenated using a hashtag `#` + -t, --taxonomy + The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy + -o, --output + Output location where to store the suffix array + -s, --sparseness-factor + The sparseness_factor used on the suffix array (default value 1, which means every value in the SA is used) [default: 1] + -a, --construction-algorithm + The algorithm used to construct the suffix array (default value LibSais) [default: lib-sais] [possible values: lib-div-suf-sort, lib-sais] + -c, --compress-sa + If the suffix array should be compressed (default value true) + -h, --help + Print help +``` diff --git a/sa-builder/src/binary.rs b/sa-builder/src/binary.rs deleted file mode 100644 index ad417d3..0000000 --- a/sa-builder/src/binary.rs +++ /dev/null @@ -1,149 +0,0 @@ -use std::{ - cmp::min, - error::Error, - fs::{ - File, - OpenOptions - }, - io::{ - Read, - Write - } -}; - -const ONE_GIB: usize = 2usize.pow(30); - -/// Trait implemented by structs that are binary serializable -/// In our case this is will be a [i64] since the suffix array is a Vec -pub trait Serializable { - /// Serializes self into a vector of bytes - /// - /// # Returns - /// - /// Returns a vector of bytes - fn serialize(&self) -> Vec; -} - -impl Serializable for [i64] { - fn serialize(&self) -> Vec { - let mut res = vec![]; - self.iter() - .for_each(|entry| res.extend_from_slice(&entry.to_le_bytes())); - res - } -} - -/// Deserializes a vector of bytes into the suffix array -/// -/// # Arguments -/// * `data` - The raw bytes needed to be serialized into a suffix array -/// -/// # Returns -/// -/// Returns the suffix array, a Vec -fn deserialize_sa(data: &[u8]) -> Vec { - let mut res = vec![]; - if data.len() % 8 != 0 { - panic!("Serialized data is not a multiple of 8 bytes long!") - } - for start in (0 .. data.len()).step_by(8) { - res.push(i64::from_le_bytes(data[start .. start + 8].try_into().unwrap())); - } - res -} - -/// Writes the given suffix array with the `sparseness_factor` factor to the given file -/// -/// # Arguments -/// * `sparseness_factor` - The sparseness factor of the suffix array -/// * `suffix_array` - The suffix array -/// * `filename` - The name of the file we want to write the suffix array to -/// -/// # Returns -/// -/// Returns () if writing away the suffix array succeeded -/// -/// # Errors -/// -/// Returns an io::Error if writing away the suffix array failed -pub fn write_suffix_array( - sparseness_factor: u8, - suffix_array: &[i64], - filename: &str -) -> Result<(), std::io::Error> { - // create the file - let mut f = OpenOptions::new() - .create(true) - .write(true) - .truncate(true) // if the file already exists, empty the file - .open(filename)?; - f.write_all(&[sparseness_factor])?; // write the sample rate as the first byte - - // write 1 GiB at a time, to minimize extra used memory since we need to translate i64 to [u8; - // 8] - let sa_len = suffix_array.len(); - for start_index in (0 .. sa_len).step_by(ONE_GIB / 8) { - let end_index = min(start_index + ONE_GIB / 8, sa_len); - f.write_all(&suffix_array[start_index .. end_index].serialize())?; - } - - Ok(()) -} - -/// Loads the suffix array from the file with the given `filename` -/// -/// # Arguments -/// * `filename` - The filename of the file where the suffix array is stored -/// -/// # Returns -/// -/// Returns the sample rate of the suffix array, together with the suffix array -/// -/// # Errors -/// -/// Returns any error from opening the file or reading the file -pub fn load_suffix_array(filename: &str) -> Result<(u8, Vec), Box> { - let mut file = &File::open(filename)?; - let mut sparseness_factor_buffer = [0_u8; 1]; - file.read_exact(&mut sparseness_factor_buffer) - .map_err(|_| "Could not read the sample rate from the binary file")?; - let sparseness_factor = sparseness_factor_buffer[0]; - - let mut sa = vec![]; - loop { - let mut buffer = vec![]; - // use take in combination with read_to_end to ensure that the buffer will be completely - // filled (except when the file is smaller than the buffer) - let count = file.take(ONE_GIB as u64).read_to_end(&mut buffer)?; - if count == 0 { - break; - } - sa.extend_from_slice(&deserialize_sa(&buffer[.. count])); - } - - Ok((sparseness_factor, sa)) -} - -#[cfg(test)] -mod tests { - use crate::binary::{ - deserialize_sa, - Serializable - }; - - #[test] - fn test_serialize_deserialize() { - let data: Vec = vec![5, 2165487362, -12315135]; - let serialized = data.serialize(); - let deserialized = deserialize_sa(serialized.as_ref()); - assert_eq!(data, deserialized); - } - - #[test] - fn test_serialize_deserialize_empty() { - let data: Vec = vec![]; - let serialized = data.serialize(); - let deserialized = deserialize_sa(serialized.as_ref()); - assert_eq!(data, deserialized); - } -} diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 65092e5..98ac63d 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -1,5 +1,3 @@ -pub mod binary; - use std::error::Error; use clap::{ @@ -7,25 +5,29 @@ use clap::{ ValueEnum }; -/// Enum that represents all possible commandline arguments +/// Build a (sparse, compressed) suffix array from the given text #[derive(Parser, Debug)] pub struct Arguments { /// File with the proteins used to build the suffix tree. All the proteins are expected to be - /// concatenated using a `#`. + /// concatenated using a hashtag `#`. #[arg(short, long)] pub database_file: String, - #[arg(short, long)] /// The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy. + #[arg(short, long)] pub taxonomy: String, - /// Output file to store the built index. + /// Output location where to store the suffix array #[arg(short, long)] pub output: String, /// The sparseness_factor used on the suffix array (default value 1, which means every value in /// the SA is used) - #[arg(long, default_value_t = 1)] + #[arg(short, long, default_value_t = 1)] pub sparseness_factor: u8, - #[arg(short, long, value_enum, default_value_t = SAConstructionAlgorithm::LibSais)] - pub construction_algorithm: SAConstructionAlgorithm + /// The algorithm used to construct the suffix array (default value LibSais) + #[arg(short('a'), long, value_enum, default_value_t = SAConstructionAlgorithm::LibSais)] + pub construction_algorithm: SAConstructionAlgorithm, + /// If the suffix array should be compressed (default value true) + #[arg(short, long, default_value_t = false)] + pub compress_sa: bool } /// Enum representing the two possible algorithms to construct the suffix array @@ -35,51 +37,185 @@ pub enum SAConstructionAlgorithm { LibSais } -/// Gets the current time in ms +/// Build a sparse suffix array from the given text /// /// # Arguments -/// * `data` - The text on which we want to build the suffix array +/// * `text` - The text on which we want to build the suffix array /// * `construction_algorithm` - The algorithm used during construction /// * `sparseness_factor` - The sparseness factor used on the suffix array /// /// # Returns /// -/// Returns the constructed suffix array +/// Returns the constructed (sparse) suffix array /// /// # Errors /// /// The errors that occurred during the building of the suffix array itself -pub fn build_sa( - data: &mut Vec, +pub fn build_ssa( + text: &mut Vec, construction_algorithm: &SAConstructionAlgorithm, sparseness_factor: u8 ) -> Result, Box> { // translate all L's to a I - for character in data.iter_mut() { + translate_l_to_i(text); + + // Build the suffix array using the selected algorithm + let mut sa = match construction_algorithm { + SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(text), + SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text) + } + .ok_or("Building suffix array failed")?; + + // make the SA sparse and decrease the vector size if we have sampling (sampling_rate > 1) + sample_sa(&mut sa, sparseness_factor); + + Ok(sa) +} + +/// Translate all L's to I's in the given text +/// +/// # Arguments +/// * `text` - The text in which we want to translate the L's to I's +/// +/// # Returns +/// +/// The text with all L's translated to I's +fn translate_l_to_i(text: &mut [u8]) { + for character in text.iter_mut() { if *character == b'L' { *character = b'I' } } +} - let mut sa = match construction_algorithm { - SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(data), - SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(data) +/// Sample the suffix array with the given sparseness factor +/// +/// # Arguments +/// * `sa` - The suffix array that we want to sample +/// * `sparseness_factor` - The sparseness factor used for sampling +/// +/// # Returns +/// +/// The sampled suffix array +fn sample_sa(sa: &mut Vec, sparseness_factor: u8) { + if sparseness_factor <= 1 { + return; } - .ok_or("Building suffix array failed")?; - // make the SA sparse and decrease the vector size if we have sampling (== sampling_rate > 1) - if sparseness_factor > 1 { - let mut current_sampled_index = 0; - for i in 0 .. sa.len() { - let current_sa_val = sa[i]; - if current_sa_val % sparseness_factor as i64 == 0 { - sa[current_sampled_index] = current_sa_val; - current_sampled_index += 1; - } + let mut current_sampled_index = 0; + for i in 0 .. sa.len() { + let current_sa_val = sa[i]; + if current_sa_val % sparseness_factor as i64 == 0 { + sa[current_sampled_index] = current_sa_val; + current_sampled_index += 1; } - // make shorter - sa.resize(current_sampled_index, 0); } - Ok(sa) + // make shorter + sa.resize(current_sampled_index, 0); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_arguments() { + let args = Arguments::parse_from(&[ + "sa-builder", + "--database-file", + "database.fa", + "--taxonomy", + "taxonomy.tsv", + "--output", + "output.fa", + "--sparseness-factor", + "2", + "--construction-algorithm", + "lib-div-suf-sort", + "--compress-sa" + ]); + + assert_eq!(args.database_file, "database.fa"); + assert_eq!(args.taxonomy, "taxonomy.tsv"); + assert_eq!(args.output, "output.fa"); + assert_eq!(args.sparseness_factor, 2); + assert_eq!(args.construction_algorithm, SAConstructionAlgorithm::LibDivSufSort); + assert_eq!(args.compress_sa, true); + } + + #[test] + fn test_sa_construction_algorithm() { + assert_eq!( + SAConstructionAlgorithm::from_str("lib-div-suf-sort", false), + Ok(SAConstructionAlgorithm::LibDivSufSort) + ); + assert_eq!( + SAConstructionAlgorithm::from_str("lib-sais", false), + Ok(SAConstructionAlgorithm::LibSais) + ); + } + + #[test] + fn test_build_ssa_libsais() { + let mut text = b"ABRACADABRA$".to_vec(); + let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 1).unwrap(); + assert_eq!(sa, vec![11, 10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]); + } + + #[test] + fn test_build_ssa_libsais_empty() { + let mut text = b"".to_vec(); + let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 1).unwrap(); + assert_eq!(sa, vec![]); + } + + #[test] + fn test_build_ssa_libsais_sparse() { + let mut text = b"ABRACADABRA$".to_vec(); + let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibSais, 2).unwrap(); + assert_eq!(sa, vec![10, 0, 8, 4, 6, 2]); + } + + #[test] + fn test_build_ssa_libdivsufsort() { + let mut text = b"ABRACADABRA$".to_vec(); + let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap(); + assert_eq!(sa, vec![11, 10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]); + } + + #[test] + fn test_build_ssa_libdivsufsort_empty() { + let mut text = b"".to_vec(); + let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 1).unwrap(); + assert_eq!(sa, vec![]); + } + + #[test] + fn test_build_ssa_libdivsufsort_sparse() { + let mut text = b"ABRACADABRA$".to_vec(); + let sa = build_ssa(&mut text, &SAConstructionAlgorithm::LibDivSufSort, 2).unwrap(); + assert_eq!(sa, vec![10, 0, 8, 4, 6, 2]); + } + + #[test] + fn test_translate_l_to_i() { + let mut text = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ$-".to_vec(); + translate_l_to_i(&mut text); + assert_eq!(text, b"ABCDEFGHIJKIMNOPQRSTUVWXYZ$-".to_vec()); + } + + #[test] + fn test_sample_sa_1() { + let mut sa = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + sample_sa(&mut sa, 1); + assert_eq!(sa, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + } + + #[test] + fn test_sample_sa_2() { + let mut sa = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + sample_sa(&mut sa, 2); + assert_eq!(sa, vec![0, 2, 4, 6, 8]); + } } diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index e651673..812bd7d 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -1,9 +1,18 @@ +use std::{ + fs::{ + File, + OpenOptions + }, + io::Result +}; + use clap::Parser; use sa_builder::{ - binary::write_suffix_array, - build_sa, + build_ssa, Arguments }; +use sa_compression::dump_compressed_suffix_array; +use sa_index::binary::dump_suffix_array; use sa_mappings::{ proteins::Proteins, taxonomy::{ @@ -13,42 +22,75 @@ use sa_mappings::{ }; fn main() { - let args = Arguments::parse(); let Arguments { database_file, taxonomy, output, sparseness_factor, - construction_algorithm - } = args; + construction_algorithm, + compress_sa + } = Arguments::parse(); + + eprintln!(); + eprintln!("📋 Started loading the taxon file..."); let taxon_id_calculator = - TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar); - if let Err(err) = taxon_id_calculator { - eprintln!("{}", err); - std::process::exit(1); - } + TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar) + .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); + eprintln!("✅ Successfully loaded the taxon file!"); + eprintln!("\tAggregation method: LCA*"); - let taxon_id_calculator = taxon_id_calculator.unwrap(); + eprintln!(); + eprintln!("📋 Started loading the proteins..."); + let mut data = + Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator) + .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); + eprintln!("✅ Successfully loaded the proteins!"); - // read input - let data = - Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator); - if let Err(err) = data { - eprintln!("{}", err); - std::process::exit(1); - } - let mut data = data.unwrap(); - // calculate sa - let sa = build_sa(&mut data, &construction_algorithm, sparseness_factor); - if let Err(err) = sa { - eprintln!("{}", err); - std::process::exit(1); + eprintln!(); + eprintln!("📋 Started building the suffix array..."); + let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor) + .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); + eprintln!("✅ Successfully built the suffix array!"); + eprintln!("\tAmount of items: {}", sa.len()); + eprintln!("\tSample rate: {}", sparseness_factor); + + // open the output file + let mut file = + open_file(&output).unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); + + eprintln!(); + eprintln!("📋 Started dumping the suffix array..."); + + if compress_sa { + let bits_per_value = (data.len() as f64).log2().ceil() as usize; + + if let Err(err) = + dump_compressed_suffix_array(sa, sparseness_factor, bits_per_value, &mut file) + { + eprint_and_exit(err.to_string().as_str()); + }; + + eprintln!("✅ Successfully dumped the suffix array!"); + eprintln!("\tAmount of bits per item: {}", bits_per_value); + } else { + if let Err(err) = dump_suffix_array(&sa, sparseness_factor, &mut file) { + eprint_and_exit(err.to_string().as_str()); + } + + eprintln!("✅ Successfully dumped the suffix array!"); + eprintln!("\tAmount of bits per item: 64"); } - let sa = sa.unwrap(); +} + +fn open_file(file: &str) -> Result { + OpenOptions::new() + .create(true) + .write(true) + .truncate(true) // if the file already exists, empty the file + .open(file) +} - // output the build SA - if let Err(err) = write_suffix_array(sparseness_factor, &sa, &output) { - eprintln!("{}", err); - std::process::exit(1); - }; +fn eprint_and_exit(err: &str) -> ! { + eprintln!("{}", err); + std::process::exit(1); } diff --git a/sa-compression/Cargo.toml b/sa-compression/Cargo.toml new file mode 100644 index 0000000..70a6cbf --- /dev/null +++ b/sa-compression/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "sa-compression" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +bitarray = { path = "../bitarray" } +sa-index = { path = "../sa-index" } diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs new file mode 100644 index 0000000..85a41df --- /dev/null +++ b/sa-compression/src/lib.rs @@ -0,0 +1,271 @@ +use std::{ + error::Error, + io::{ + BufRead, + Write + } +}; + +use bitarray::{ + data_to_writer, + Binary, + BitArray +}; +use sa_index::SuffixArray; + +/// Writes the compressed suffix array to a writer. +/// +/// # Arguments +/// +/// * `sa` - The suffix array to be compressed. +/// * `sparseness_factor` - The sparseness factor used for compression. +/// * `bits_per_value` - The number of bits used to represent each value in the compressed array. +/// * `writer` - The writer to which the compressed array will be written. +/// +/// # Errors +/// +/// Returns an error if writing to the writer fails. +pub fn dump_compressed_suffix_array( + sa: Vec, + sparseness_factor: u8, + bits_per_value: usize, + writer: &mut impl Write +) -> Result<(), Box> { + // Write the flags to the writer + // 00000001 indicates that the suffix array is compressed + writer + .write(&[bits_per_value as u8]) + .map_err(|_| "Could not write the required bits to the writer")?; + + // Write the sparseness factor to the writer + writer + .write(&[sparseness_factor]) + .map_err(|_| "Could not write the sparseness factor to the writer")?; + + // Write the size of the suffix array to the writer + writer + .write(&(sa.len() as u64).to_le_bytes()) + .map_err(|_| "Could not write the size of the suffix array to the writer")?; + + // Compress the suffix array and write it to the writer + data_to_writer(sa, bits_per_value, 8 * 1024, writer) + .map_err(|_| "Could not write the compressed suffix array to the writer")?; + + Ok(()) +} + +/// Load the compressed suffix array from a reader. +/// +/// # Arguments +/// +/// * `reader` - The reader from which the compressed array will be read. +/// * `bits_per_value` - The number of bits used to represent each value in the compressed array. +/// +/// # Errors +/// +/// Returns an error if reading from the reader fails. +pub fn load_compressed_suffix_array( + reader: &mut impl BufRead, + bits_per_value: usize +) -> Result> { + // Read the sample rate from the binary file (1 byte) + let mut sample_rate_buffer = [0_u8; 1]; + reader + .read_exact(&mut sample_rate_buffer) + .map_err(|_| "Could not read the sample rate from the binary file")?; + let sample_rate = sample_rate_buffer[0]; + + // Read the size of the suffix array from the binary file (8 bytes) + let mut size_buffer = [0_u8; 8]; + reader + .read_exact(&mut size_buffer) + .map_err(|_| "Could not read the size of the suffix array from the binary file")?; + let size = u64::from_le_bytes(size_buffer) as usize; + + // Read the compressed suffix array from the binary file + let mut compressed_suffix_array = BitArray::with_capacity(size, bits_per_value); + compressed_suffix_array + .read_binary(reader) + .map_err(|_| "Could not read the compressed suffix array from the binary file")?; + + Ok(SuffixArray::Compressed(compressed_suffix_array, sample_rate)) +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use super::*; + + pub struct FailingWriter { + /// The number of times the write function can be called before it fails. + pub valid_write_count: usize + } + + impl Write for FailingWriter { + fn write(&mut self, _: &[u8]) -> Result { + if self.valid_write_count == 0 { + return Err(std::io::Error::new(std::io::ErrorKind::Other, "Write failed")); + } + + self.valid_write_count -= 1; + Ok(1) + } + + fn flush(&mut self) -> Result<(), std::io::Error> { + Ok(()) + } + } + + pub struct FailingReader { + /// The number of times the read function can be called before it fails. + pub valid_read_count: usize + } + + impl Read for FailingReader { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + if self.valid_read_count == 0 { + return Err(std::io::Error::new(std::io::ErrorKind::Other, "Read failed")); + } + + self.valid_read_count -= 1; + Ok(buf.len()) + } + } + + impl BufRead for FailingReader { + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { + Ok(&[]) + } + + fn consume(&mut self, _: usize) {} + } + + #[test] + fn test_dump_compressed_suffix_array() { + let sa = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; + + let mut writer = vec![]; + dump_compressed_suffix_array(sa, 1, 8, &mut writer).unwrap(); + + assert_eq!( + writer, + vec![ + // bits per value + 8, // sparseness factor + 1, // size of the suffix array + 10, 0, 0, 0, 0, 0, 0, 0, // compressed suffix array + 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 10, 9 + ] + ); + } + + #[test] + #[should_panic(expected = "Could not write the required bits to the writer")] + fn test_dump_compressed_suffix_array_fail_required_bits() { + let mut writer = FailingWriter { + valid_write_count: 0 + }; + + dump_compressed_suffix_array(vec![], 1, 8, &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the sparseness factor to the writer")] + fn test_dump_compressed_suffix_array_fail_sparseness_factor() { + let mut writer = FailingWriter { + valid_write_count: 1 + }; + + dump_compressed_suffix_array(vec![], 1, 8, &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the size of the suffix array to the writer")] + fn test_dump_compressed_suffix_array_fail_size() { + let mut writer = FailingWriter { + valid_write_count: 2 + }; + + dump_compressed_suffix_array(vec![], 1, 8, &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the compressed suffix array to the writer")] + fn test_dump_compressed_suffix_array_fail_compressed_suffix_array() { + let mut writer = FailingWriter { + valid_write_count: 3 + }; + + dump_compressed_suffix_array(vec![1], 1, 8, &mut writer).unwrap(); + } + + #[test] + fn test_load_compressed_suffix_array() { + let data = vec![ + // sparseness factor + 1, // size of the suffix array + 10, 0, 0, 0, 0, 0, 0, 0, // compressed suffix array + 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 10, 9, + ]; + + let mut reader = std::io::BufReader::new(&data[..]); + let compressed_suffix_array = load_compressed_suffix_array(&mut reader, 8).unwrap(); + + assert_eq!(compressed_suffix_array.sample_rate(), 1); + for i in 0 .. 10 { + assert_eq!(compressed_suffix_array.get(i), i as i64 + 1); + } + } + + #[test] + #[should_panic(expected = "Could not read the sample rate from the binary file")] + fn test_load_compressed_suffix_array_fail_sample_rate() { + let mut reader = FailingReader { + valid_read_count: 0 + }; + + load_compressed_suffix_array(&mut reader, 8).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not read the size of the suffix array from the binary file")] + fn test_load_compressed_suffix_array_fail_size() { + let mut reader = FailingReader { + valid_read_count: 1 + }; + + load_compressed_suffix_array(&mut reader, 8).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not read the compressed suffix array from the binary file")] + fn test_load_compressed_suffix_array_fail_compressed_suffix_array() { + let mut reader = FailingReader { + valid_read_count: 2 + }; + + load_compressed_suffix_array(&mut reader, 8).unwrap(); + } + + #[test] + fn test_failing_writer() { + let mut writer = FailingWriter { + valid_write_count: 0 + }; + assert!(writer.flush().is_ok()); + assert!(writer.write(&[0]).is_err()); + } + + #[test] + fn test_failing_reader() { + let mut reader = FailingReader { + valid_read_count: 0 + }; + let right_buffer: [u8; 0] = []; + assert_eq!(reader.fill_buf().unwrap(), &right_buffer); + assert_eq!(reader.consume(0), ()); + let mut buffer = [0_u8; 1]; + assert!(reader.read(&mut buffer).is_err()); + } +} diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml index 70acb67..557549c 100644 --- a/sa-index/Cargo.toml +++ b/sa-index/Cargo.toml @@ -7,12 +7,13 @@ edition = "2021" [dev-dependencies] tempdir = "0.3.7" +fa-compression = { path = "../fa-compression" } [dependencies] clap = { version = "4.4.8", features = ["derive"] } umgap = "1.1.0" rayon = "1.8.1" serde = { version = "1.0.197", features = ["derive"] } -sa-builder = { path = "../sa-builder" } sa-mappings = { path = "../sa-mappings" } +bitarray = { path = "../bitarray" } serde_json = "1.0.116" diff --git a/sa-index/src/binary.rs b/sa-index/src/binary.rs new file mode 100644 index 0000000..5688d4a --- /dev/null +++ b/sa-index/src/binary.rs @@ -0,0 +1,416 @@ +use std::{ + error::Error, + io::{ + BufRead, + Read, + Write + } +}; + +use crate::SuffixArray; + +/// The `Binary` trait provides methods for reading and writing a struct as binary. +pub trait Binary { + /// Writes the struct as binary to the given writer. + /// + /// # Arguments + /// + /// * `writer` - The writer to write the binary data to. + /// + /// # Returns + /// + /// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs. + fn write_binary(&self, writer: &mut W) -> std::io::Result<()>; + + /// Reads binary data into a struct from the given reader. + /// + /// # Arguments + /// + /// * `reader` - The reader to read the binary data from. + /// + /// # Returns + /// + /// Returns `Ok(())` if the read operation is successful, or an `Err` if an error occurs. + fn read_binary(&mut self, reader: R) -> std::io::Result<()>; +} + +/// Implements the `Binary` trait for `Vec`. +impl Binary for Vec { + /// Writes the elements of the vector to a binary file. + /// + /// # Arguments + /// + /// * `writer` - The writer to which the binary data will be written. + /// + /// # Returns + /// + /// Returns `Ok(())` if the write operation is successful, or an `std::io::Error` otherwise. + fn write_binary(&self, writer: &mut W) -> std::io::Result<()> { + for value in self { + writer.write_all(&value.to_le_bytes())?; + } + + Ok(()) + } + + /// Reads binary data from a reader and populates the vector with the read values. + /// + /// # Arguments + /// + /// * `reader` - The reader from which the binary data will be read. + /// + /// # Returns + /// + /// Returns `Ok(())` if the read operation is successful, or an `std::io::Error` otherwise. + fn read_binary(&mut self, mut reader: R) -> std::io::Result<()> { + self.clear(); + + let mut buffer = vec![0; 8 * 1024]; + + loop { + let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer)?; + for buffer_slice in buffer[.. bytes_read].chunks_exact(8) { + self.push(i64::from_le_bytes(buffer_slice.try_into().unwrap())); + } + + if finished { + break; + } + } + + Ok(()) + } +} + +/// Writes the suffix array to a binary file. +/// +/// # Arguments +/// +/// * `sa` - The suffix array to dump. +/// * `sparseness_factor` - The sparseness factor to write to the file. +/// * `writer` - The writer to write the binary data to. +/// +/// # Returns +/// +/// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs. +pub fn dump_suffix_array( + sa: &Vec, + sparseness_factor: u8, + writer: &mut impl Write +) -> Result<(), Box> { + // Write the required bits to the writer + // 01000000 indicates that the suffix array is not compressed + writer + .write(&[64_u8]) + .map_err(|_| "Could not write the required bits to the writer")?; + + // Write the sparseness factor to the writer + writer + .write(&[sparseness_factor]) + .map_err(|_| "Could not write the sparseness factor to the writer")?; + + // Write the size of the suffix array to the writer + let sa_len = sa.len(); + writer + .write(&(sa_len).to_le_bytes()) + .map_err(|_| "Could not write the size of the suffix array to the writer")?; + + // Write the suffix array to the writer + sa.write_binary(writer) + .map_err(|_| "Could not write the suffix array to the writer")?; + + Ok(()) +} + +/// Loads the suffix array from the file with the given `filename` +/// +/// # Arguments +/// * `filename` - The filename of the file where the suffix array is stored +/// +/// # Returns +/// +/// Returns the sample rate of the suffix array, together with the suffix array +/// +/// # Errors +/// +/// Returns any error from opening the file or reading the file +pub fn load_suffix_array(reader: &mut impl BufRead) -> Result> { + // Read the sample rate from the binary file (1 byte) + let mut sample_rate_buffer = [0_u8; 1]; + reader + .read_exact(&mut sample_rate_buffer) + .map_err(|_| "Could not read the sample rate from the binary file")?; + let sample_rate = sample_rate_buffer[0]; + + // Read the size of the suffix array from the binary file (8 bytes) + let mut size_buffer = [0_u8; 8]; + reader + .read_exact(&mut size_buffer) + .map_err(|_| "Could not read the size of the suffix array from the binary file")?; + let size = u64::from_le_bytes(size_buffer) as usize; + + let mut sa = Vec::with_capacity(size); + sa.read_binary(reader) + .map_err(|_| "Could not read the suffix array from the binary file")?; + + Ok(SuffixArray::Original(sa, sample_rate)) +} + +/// Fills the buffer with data read from the input. +/// +/// # Arguments +/// +/// * `input` - The input source to read data from. +/// * `buffer` - The buffer to fill with data. +/// +/// # Returns +/// +/// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input +/// is reached, and `bytes_read` is the number of bytes read into the buffer. +fn fill_buffer(input: &mut T, buffer: &mut Vec) -> std::io::Result<(bool, usize)> { + // Store the buffer size in advance, because rust will complain + // about the buffer being borrowed mutably while it's borrowed + let buffer_size = buffer.len(); + + let mut writable_buffer_space = buffer.as_mut(); + + loop { + match input.read(writable_buffer_space) { + // No bytes written, which means we've completely filled the buffer + // or we've reached the end of the file + Ok(0) => { + return Ok(( + !writable_buffer_space.is_empty(), + buffer_size - writable_buffer_space.len() + )); + } + + // We've read {bytes_read} bytes + Ok(bytes_read) => { + // Shrink the writable buffer slice + writable_buffer_space = writable_buffer_space[bytes_read ..].as_mut(); + } + + // An error occurred while reading + Err(e) => { + return Err(e); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + pub struct FailingWriter { + /// The number of times the write function can be called before it fails. + pub valid_write_count: usize + } + + impl Write for FailingWriter { + fn write(&mut self, _: &[u8]) -> Result { + if self.valid_write_count == 0 { + return Err(std::io::Error::new(std::io::ErrorKind::Other, "Write failed")); + } + + self.valid_write_count -= 1; + Ok(1) + } + + fn flush(&mut self) -> Result<(), std::io::Error> { + Ok(()) + } + } + + pub struct FailingReader { + /// The number of times the read function can be called before it fails. + pub valid_read_count: usize + } + + impl Read for FailingReader { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + if self.valid_read_count == 0 { + return Err(std::io::Error::new(std::io::ErrorKind::Other, "Read failed")); + } + + self.valid_read_count -= 1; + Ok(buf.len()) + } + } + + impl BufRead for FailingReader { + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { + Ok(&[]) + } + + fn consume(&mut self, _: usize) {} + } + + #[test] + fn test_fill_buffer() { + let input_str = "a".repeat(8_000); + let mut input = input_str.as_bytes(); + + let mut buffer = vec![0; 800]; + + loop { + let (finished, bytes_read) = fill_buffer(&mut input, &mut buffer).unwrap(); + + if finished { + assert!(bytes_read < 800); + break; + } else { + assert_eq!(bytes_read, 800); + } + } + } + + #[test] + fn test_fill_buffer_read_error() { + let mut input = FailingReader { + valid_read_count: 0 + }; + let mut buffer = vec![0; 800]; + + assert!(fill_buffer(&mut input, &mut buffer).is_err()); + } + + #[test] + fn test_write_binary() { + let mut buffer = Vec::new(); + let values = vec![1, 2, 3, 4, 5]; + + values.write_binary(&mut buffer).unwrap(); + + assert_eq!( + buffer, + vec![ + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0 + ] + ); + } + + #[test] + fn test_read_binary() { + let buffer = vec![ + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, + 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, + ]; + + let mut values = Vec::new(); + values.read_binary(buffer.as_slice()).unwrap(); + + assert_eq!(values, vec![1, 2, 3, 4, 5]); + } + + #[test] + fn test_dump_suffix_array() { + let mut buffer = Vec::new(); + let sa = vec![1, 2, 3, 4, 5]; + + dump_suffix_array(&sa, 1, &mut buffer).unwrap(); + + assert_eq!( + buffer, + vec![ + // required bits + 64, // Sparseness factor + 1, // Size of the suffix array + 5, 0, 0, 0, 0, 0, 0, 0, // Suffix array + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0 + ] + ); + } + + #[test] + #[should_panic(expected = "Could not write the required bits to the writer")] + fn test_dump_suffix_array_fail_required_bits() { + let mut writer = FailingWriter { + valid_write_count: 0 + }; + + dump_suffix_array(&vec![], 1, &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the sparseness factor to the writer")] + fn test_dump_suffix_array_fail_sparseness_factor() { + let mut writer = FailingWriter { + valid_write_count: 1 + }; + + dump_suffix_array(&vec![], 1, &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the size of the suffix array to the writer")] + fn test_dump_suffix_array_fail_size() { + let mut writer = FailingWriter { + valid_write_count: 2 + }; + + dump_suffix_array(&vec![], 1, &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the suffix array to the writer")] + fn test_dump_suffix_array_fail_suffix_array() { + let mut writer = FailingWriter { + valid_write_count: 3 + }; + + dump_suffix_array(&vec![1], 1, &mut writer).unwrap(); + } + + #[test] + fn test_load_suffix_array() { + let buffer = vec![ + // Sample rate + 1, // Size of the suffix array + 5, 0, 0, 0, 0, 0, 0, 0, // Suffix array + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, + 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, + ]; + + let mut reader = buffer.as_slice(); + let sa = load_suffix_array(&mut reader).unwrap(); + + assert_eq!(sa.sample_rate(), 1); + for i in 0 .. 5 { + assert_eq!(sa.get(i), i as i64 + 1); + } + } + + #[test] + #[should_panic(expected = "Could not read the sample rate from the binary file")] + fn test_load_suffix_array_fail_sample_rate() { + let mut reader = FailingReader { + valid_read_count: 0 + }; + + load_suffix_array(&mut reader).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not read the size of the suffix array from the binary file")] + fn test_load_suffix_array_fail_size() { + let mut reader = FailingReader { + valid_read_count: 1 + }; + + load_suffix_array(&mut reader).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not read the suffix array from the binary file")] + fn test_load_suffix_array_fail_suffix_array() { + let mut reader = FailingReader { + valid_read_count: 2 + }; + + load_suffix_array(&mut reader).unwrap(); + } +} diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs index b017880..f276906 100644 --- a/sa-index/src/lib.rs +++ b/sa-index/src/lib.rs @@ -1,248 +1,94 @@ -use std::{ - error::Error, - num::NonZeroUsize -}; - -use clap::{ - arg, - Parser, - ValueEnum -}; -use sa_builder::{ - binary::{ - load_suffix_array, - write_suffix_array - }, - build_sa, - SAConstructionAlgorithm -}; -use sa_mappings::{ - functionality::FunctionAggregator, - proteins::Proteins, - taxonomy::{ - AggregationMethod, - TaxonAggregator - } -}; - -use crate::{ - peptide_search::{ - analyse_all_peptides, - search_all_peptides - }, - sa_searcher::Searcher, - suffix_to_protein_index::{ - DenseSuffixToProtein, - SparseSuffixToProtein, - SuffixToProteinIndex, - SuffixToProteinMappingStyle - }, - util::{ - get_time_ms, - read_lines - } -}; +use bitarray::BitArray; +pub mod binary; pub mod peptide_search; pub mod sa_searcher; pub mod suffix_to_protein_index; -pub mod util; -/// Enum that represents the 2 kinds of search that are supported -#[derive(ValueEnum, Clone, Debug, PartialEq)] -pub enum SearchMode { - Search, - Analysis +/// Represents a suffix array. +pub enum SuffixArray { + /// The original suffix array. + Original(Vec, u8), + /// The compressed suffix array. + Compressed(BitArray, u8) } -/// Enum that represents all possible commandline arguments -#[derive(Parser, Debug)] -pub struct Arguments { - /// File with the proteins used to build the suffix tree. All the proteins are expected to be - /// concatenated using a `#`. - #[arg(short, long)] - database_file: String, - #[arg(short, long)] - search_file: Option, - #[arg(short, long)] - /// The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy. - taxonomy: String, - /// This will only build the tree and stop after that is completed. Used during benchmarking. - #[arg(long)] - build_only: bool, - /// Output file to store the built index. - #[arg(short, long)] - output: Option, - /// The sparseness factor used on the suffix array (default value 1, which means every value in - /// the SA is used) - #[arg(long, default_value_t = 1)] - sparseness_factor: u8, - /// Set the style used to map back from the suffix to the protein. 2 options or - /// . Dense is default Dense uses O(n) memory with n the size of the input text, and - /// takes O(1) time to find the mapping Sparse uses O(m) memory with m the number of - /// proteins, and takes O(log m) to find the mapping - #[arg(long, value_enum, default_value_t = SuffixToProteinMappingStyle::Sparse)] - suffix_to_protein_mapping: SuffixToProteinMappingStyle, - #[arg(long)] - load_index: Option, - #[arg(short, long, value_enum, default_value_t = SAConstructionAlgorithm::LibSais)] - construction_algorithm: SAConstructionAlgorithm, - /// Assume the resulting taxon ID is root (1) whenever a peptide matches >= cutoff proteins - #[arg(long, default_value_t = 10000)] - cutoff: usize, - #[arg(long)] - threads: Option, - #[arg(long)] - equalize_i_and_l: bool, - #[arg(long)] - clean_taxa: bool, - #[arg(long, value_enum, default_value_t = SearchMode::Analysis)] - search_mode: SearchMode -} - -/// Run the suffix array program -/// -/// # Arguments -/// * `args` - The commandline arguments provided to the program -/// -/// # Returns -/// -/// Returns Unit -/// -/// # Errors -/// -/// Returns all possible errors that occurred during the program -pub fn run(mut args: Arguments) -> Result<(), Box> { - let taxon_id_calculator = - TaxonAggregator::try_from_taxonomy_file(&args.taxonomy, AggregationMethod::LcaStar)?; - - let sa = match &args.load_index { - // load SA from file - Some(index_file_name) => { - let (sparseness_factor, sa) = load_suffix_array(index_file_name)?; - args.sparseness_factor = sparseness_factor; - // TODO: some kind of security check that the loaded database file and SA match - sa - } - // build the SA - None => { - let protein_sequences = - Proteins::try_from_database_file(&args.database_file, &taxon_id_calculator)?; - build_sa( - &mut protein_sequences.input_string.clone(), - &args.construction_algorithm, - args.sparseness_factor - )? +impl SuffixArray { + /// Returns the length of the suffix array. + /// + /// # Returns + /// + /// The length of the suffix array. + pub fn len(&self) -> usize { + match self { + SuffixArray::Original(sa, _) => sa.len(), + SuffixArray::Compressed(sa, _) => sa.len() } - }; - - let proteins = Proteins::try_from_database_file(&args.database_file, &taxon_id_calculator)?; - - if let Some(output) = &args.output { - write_suffix_array(args.sparseness_factor, &sa, output)?; } - // option that only builds the tree, but does not allow for querying (easy for benchmark - // purposes) - if args.build_only { - return Ok(()); + /// Returns the number of bits per value in the suffix array. + /// + /// # Returns + /// + /// The number of bits per value in the suffix array. + pub fn bits_per_value(&self) -> usize { + match self { + SuffixArray::Original(_, _) => 64, + SuffixArray::Compressed(sa, _) => sa.bits_per_value() + } } - // build the right mapping index, use box to be able to store both types in this variable - let suffix_index_to_protein: Box = - match args.suffix_to_protein_mapping { - SuffixToProteinMappingStyle::Dense => { - Box::new(DenseSuffixToProtein::new(&proteins.input_string)) - } - SuffixToProteinMappingStyle::Sparse => { - Box::new(SparseSuffixToProtein::new(&proteins.input_string)) - } - }; - - let functional_aggregator = FunctionAggregator {}; - - let searcher = Searcher::new( - sa, - args.sparseness_factor, - suffix_index_to_protein, - proteins, - taxon_id_calculator, - functional_aggregator - ); - - execute_search(&searcher, &args)?; - Ok(()) -} - -/// Execute the search using the provided programs -/// -/// # Arguments -/// * `searcher` - The Searcher which contains the protein database -/// * `args` - The arguments used to start the program -/// -/// # Returns -/// -/// Returns Unit -/// -/// # Errors -/// -/// Returns possible errors that occurred during search -fn execute_search(searcher: &Searcher, args: &Arguments) -> Result<(), Box> { - let cutoff = args.cutoff; - let search_file = args - .search_file - .as_ref() - .ok_or("No peptide file provided to search in the database")?; - - let start_time = get_time_ms()?; - let lines = read_lines(search_file)?; - let all_peptides: Vec = lines.map_while(Result::ok).collect(); - - // Explicitly set the number of threads to use if the commandline argument was set - if let Some(threads) = args.threads { - rayon::ThreadPoolBuilder::new() - .num_threads(threads.get()) - .build_global()?; + /// Returns the sample rate used for the suffix array. + /// + /// # Returns + /// + /// The sample rate used for the suffix array. + pub fn sample_rate(&self) -> u8 { + match self { + SuffixArray::Original(_, sample_rate) => *sample_rate, + SuffixArray::Compressed(_, sample_rate) => *sample_rate + } } - match args.search_mode { - SearchMode::Search => { - let search_result = search_all_peptides( - searcher, - &all_peptides, - cutoff, - args.equalize_i_and_l, - args.clean_taxa - ); - println!("{}", serde_json::to_string(&search_result)?); - } - SearchMode::Analysis => { - let search_result = analyse_all_peptides( - searcher, - &all_peptides, - cutoff, - args.equalize_i_and_l, - args.clean_taxa - ); - println!("{}", serde_json::to_string(&search_result)?); + /// Returns the suffix array value at the given index. + /// + /// # Arguments + /// + /// * `index` - The index of the suffix array. + /// + /// # Returns + /// + /// The suffix array at the given index. + pub fn get(&self, index: usize) -> i64 { + match self { + SuffixArray::Original(sa, _) => sa[index], + SuffixArray::Compressed(sa, _) => sa.get(index) as i64 } } - let end_time = get_time_ms()?; - - // output to other channel to prevent integrating it into the actual output - eprintln!("Spend {} ms to search the whole file", end_time - start_time); - - Ok(()) + /// Returns whether the suffix array is empty. + /// + /// # Returns + /// + /// Returns `true` if the suffix array is empty, `false` otherwise. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } } /// Custom trait implemented by types that have a value that represents NULL pub trait Nullable { const NULL: T; + /// Returns whether the value is NULL. + /// + /// # Returns + /// + /// True if the value is NULL, false otherwise. fn is_null(&self) -> bool; } +/// Implementation of the `Nullable` trait for the `u32` type. impl Nullable for u32 { const NULL: u32 = u32::MAX; @@ -250,3 +96,83 @@ impl Nullable for u32 { *self == Self::NULL } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_suffix_array_original() { + let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1); + assert_eq!(sa.len(), 5); + assert_eq!(sa.get(0), 1); + assert_eq!(sa.get(1), 2); + assert_eq!(sa.get(2), 3); + assert_eq!(sa.get(3), 4); + assert_eq!(sa.get(4), 5); + } + + #[test] + fn test_suffix_array_compressed() { + let mut bitarray = BitArray::with_capacity(5, 40); + bitarray.set(0, 1); + bitarray.set(1, 2); + bitarray.set(2, 3); + bitarray.set(3, 4); + bitarray.set(4, 5); + + let sa = SuffixArray::Compressed(bitarray, 1); + assert_eq!(sa.len(), 5); + assert_eq!(sa.get(0), 1); + assert_eq!(sa.get(1), 2); + assert_eq!(sa.get(2), 3); + assert_eq!(sa.get(3), 4); + assert_eq!(sa.get(4), 5); + } + + #[test] + fn test_suffix_array_len() { + let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1); + assert_eq!(sa.len(), 5); + + let bitarray = BitArray::with_capacity(5, 40); + let sa = SuffixArray::Compressed(bitarray, 1); + assert_eq!(sa.len(), 5); + } + + #[test] + fn test_suffix_array_bits_per_value() { + let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1); + assert_eq!(sa.bits_per_value(), 64); + + let bitarray = BitArray::with_capacity(5, 40); + let sa = SuffixArray::Compressed(bitarray, 1); + assert_eq!(sa.bits_per_value(), 40); + } + + #[test] + fn test_suffix_array_sample_rate() { + let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1); + assert_eq!(sa.sample_rate(), 1); + + let bitarray = BitArray::with_capacity(5, 40); + let sa = SuffixArray::Compressed(bitarray, 1); + assert_eq!(sa.sample_rate(), 1); + } + + #[test] + fn test_suffix_array_is_empty() { + let sa = SuffixArray::Original(vec![], 1); + assert_eq!(sa.is_empty(), true); + + let bitarray = BitArray::with_capacity(0, 0); + let sa = SuffixArray::Compressed(bitarray, 1); + assert_eq!(sa.is_empty(), true); + } + + #[test] + fn test_nullable_is_null() { + assert_eq!(u32::NULL.is_null(), true); + assert_eq!(0u32.is_null(), false); + } +} diff --git a/sa-index/src/main.rs b/sa-index/src/main.rs deleted file mode 100644 index 8fe7fad..0000000 --- a/sa-index/src/main.rs +++ /dev/null @@ -1,13 +0,0 @@ -use clap::Parser; -use sa_index::{ - run, - Arguments -}; - -fn main() { - let args = Arguments::parse(); - if let Err(error) = run(args) { - eprintln!("{}", error); - std::process::exit(1); - }; -} diff --git a/sa-index/src/peptide_search.rs b/sa-index/src/peptide_search.rs index 8e2c0dd..7d83914 100644 --- a/sa-index/src/peptide_search.rs +++ b/sa-index/src/peptide_search.rs @@ -70,7 +70,7 @@ pub fn search_proteins_for_peptide<'a>( let peptide = peptide.strip_suffix('\n').unwrap_or(peptide).to_uppercase(); // words that are shorter than the sample rate are not searchable - if peptide.len() < searcher.sparseness_factor as usize { + if peptide.len() < searcher.sa.sample_rate() as usize { return None; } @@ -84,6 +84,7 @@ pub fn search_proteins_for_peptide<'a>( } SearchAllSuffixesResult::SearchResult(matched_suffixes) => matched_suffixes, SearchAllSuffixesResult::NoMatches => { + eprintln!("No matches found for peptide: {}", peptide); return None; } }; @@ -277,3 +278,72 @@ pub fn search_all_peptides( result: res } } + +#[cfg(test)] +mod tests { + use super::*; + + fn assert_json_eq(generated_json: &str, expected_json: &str) { + assert_eq!( + generated_json.parse::().unwrap(), + expected_json.parse::().unwrap(), + ); + } + + #[test] + fn test_serialize_output_data() { + let output_data = OutputData { + result: vec![1, 2, 3] + }; + + let generated_json = serde_json::to_string(&output_data).unwrap(); + let expected_json = "{\"result\":[1,2,3]}"; + + assert_json_eq(&generated_json, expected_json); + } + + #[test] + fn test_serialize_search_result_with_analysis() { + let search_result = SearchResultWithAnalysis { + sequence: "MSKIAALLPSV".to_string(), + lca: Some(1), + taxa: vec![1, 2, 3], + uniprot_accession_numbers: vec!["P12345".to_string(), "P23456".to_string()], + fa: None, + cutoff_used: true + }; + + let generated_json = serde_json::to_string(&search_result).unwrap(); + let expected_json = "{\"sequence\":\"MSKIAALLPSV\",\"lca\":1,\"taxa\":[1,2,3],\"uniprot_accession_numbers\":[\"P12345\",\"P23456\"],\"fa\":null,\"cutoff_used\":true}"; + + assert_json_eq(&generated_json, expected_json); + } + + #[test] + fn test_serialize_protein_info() { + let protein_info = ProteinInfo { + taxon: 1, + uniprot_accession: "P12345".to_string(), + functional_annotations: vec!["GO:0001234".to_string(), "GO:0005678".to_string()] + }; + + let generated_json = serde_json::to_string(&protein_info).unwrap(); + let expected_json = "{\"taxon\":1,\"uniprot_accession\":\"P12345\",\"functional_annotations\":[\"GO:0001234\",\"GO:0005678\"]}"; + + assert_json_eq(&generated_json, expected_json); + } + + #[test] + fn test_serialize_search_only_result() { + let search_result = SearchOnlyResult { + sequence: "MSKIAALLPSV".to_string(), + proteins: vec![], + cutoff_used: true + }; + + let generated_json = serde_json::to_string(&search_result).unwrap(); + let expected_json = "{\"sequence\":\"MSKIAALLPSV\",\"proteins\":[],\"cutoff_used\":true}"; + + assert_json_eq(&generated_json, expected_json); + } +} diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index 78cc043..29bbc9a 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -19,7 +19,8 @@ use crate::{ Minimum }, suffix_to_protein_index::SuffixToProteinIndex, - Nullable + Nullable, + SuffixArray }; /// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array @@ -100,12 +101,11 @@ impl PartialEq for SearchAllSuffixesResult { /// * `function_aggregator` - Object used to retrieve the functional annotations and to calculate /// the functional analysis provided by Unipept pub struct Searcher { - sa: Vec, - pub sparseness_factor: u8, - suffix_index_to_protein: Box, - proteins: Proteins, - taxon_id_calculator: TaxonAggregator, - function_aggregator: FunctionAggregator + pub sa: SuffixArray, + pub suffix_index_to_protein: Box, + pub proteins: Proteins, + pub taxon_id_calculator: TaxonAggregator, + pub function_aggregator: FunctionAggregator } impl Searcher { @@ -126,8 +126,7 @@ impl Searcher { /// /// Returns a new Searcher object pub fn new( - sa: Vec, - sparseness_factor: u8, + sa: SuffixArray, suffix_index_to_protein: Box, proteins: Proteins, taxon_id_calculator: TaxonAggregator, @@ -135,7 +134,6 @@ impl Searcher { ) -> Self { Self { sa, - sparseness_factor, suffix_index_to_protein, proteins, taxon_id_calculator, @@ -240,7 +238,8 @@ impl Searcher { while right - left > 1 { let center = (left + right) / 2; let skip = min(lcp_left, lcp_right); - let (retval, lcp_center) = self.compare(search_string, self.sa[center], skip, bound); + let (retval, lcp_center) = + self.compare(search_string, self.sa.get(center), skip, bound); found |= lcp_center == search_string.len(); @@ -258,7 +257,7 @@ impl Searcher { // handle edge case to search at index 0 if right == 1 && left == 0 { let (retval, lcp_center) = - self.compare(search_string, self.sa[0], min(lcp_left, lcp_right), bound); + self.compare(search_string, self.sa.get(0), min(lcp_left, lcp_right), bound); found |= lcp_center == search_string.len(); @@ -322,7 +321,7 @@ impl Searcher { } let mut skip: usize = 0; - while skip < self.sparseness_factor as usize { + while skip < self.sa.sample_rate() as usize { let mut il_locations_start = 0; while il_locations_start < il_locations.len() && il_locations[il_locations_start] < skip { @@ -339,7 +338,7 @@ impl Searcher { // array (stop when our max number of matches is reached) let mut sa_index = min_bound; while sa_index < max_bound { - let suffix = self.sa[sa_index] as usize; + let suffix = self.sa.get(sa_index) as usize; // filter away matches where I was wrongfully equalized to L, and check the // unmatched prefix when I and L equalized, we only need to // check the prefix, not the whole match, when the prefix is 0, we don't need to @@ -549,20 +548,55 @@ impl Searcher { #[cfg(test)] mod tests { - use sa_mappings::functionality::FunctionAggregator; - use sa_mappings::proteins::{Protein, Proteins}; - use sa_mappings::taxonomy::{AggregationMethod, TaxonAggregator}; - use crate::sa_searcher::{ - BoundSearchResult, SearchAllSuffixesResult, Searcher, - }; - use crate::suffix_to_protein_index::SparseSuffixToProtein; - use tempdir::TempDir; use std::{ fs::File, io::Write, path::PathBuf }; + use sa_mappings::{ + functionality::FunctionAggregator, + proteins::{ + Protein, + Proteins + }, + taxonomy::{ + AggregationMethod, + TaxonAggregator + } + }; + use tempdir::TempDir; + + use crate::{ + sa_searcher::{ + BoundSearchResult, + SearchAllSuffixesResult, + Searcher + }, + suffix_to_protein_index::SparseSuffixToProtein, + SuffixArray + }; + + #[test] + fn test_partial_eq_search_all_suffixes_result() { + let search_all_suffixes_result_1 = SearchAllSuffixesResult::SearchResult(vec![1, 2, 3]); + let search_all_suffixes_result_2 = SearchAllSuffixesResult::SearchResult(vec![3, 2, 1]); + let search_all_suffixes_result_3 = SearchAllSuffixesResult::SearchResult(vec![1, 2, 4]); + let search_all_suffixes_result_4 = SearchAllSuffixesResult::MaxMatches(vec![1, 2, 3]); + let search_all_suffixes_result_5 = SearchAllSuffixesResult::MaxMatches(vec![3, 2, 1]); + let search_all_suffixes_result_6 = SearchAllSuffixesResult::MaxMatches(vec![1, 2, 4]); + let search_all_suffixes_result_7 = SearchAllSuffixesResult::NoMatches; + let search_all_suffixes_result_8 = SearchAllSuffixesResult::NoMatches; + + assert_eq!(search_all_suffixes_result_1, search_all_suffixes_result_2); + assert_ne!(search_all_suffixes_result_1, search_all_suffixes_result_3); + assert_eq!(search_all_suffixes_result_4, search_all_suffixes_result_5); + assert_ne!(search_all_suffixes_result_4, search_all_suffixes_result_6); + assert_eq!(search_all_suffixes_result_7, search_all_suffixes_result_8); + assert_ne!(search_all_suffixes_result_1, search_all_suffixes_result_7); + assert_ne!(search_all_suffixes_result_4, search_all_suffixes_result_7); + } + fn create_taxonomy_file(tmp_dir: &TempDir) -> PathBuf { let taxonomy_file = tmp_dir.path().join("taxonomy.tsv"); let mut file = File::create(&taxonomy_file).unwrap(); @@ -585,52 +619,55 @@ mod tests { taxonomy_file } - fn get_example_proteins() -> Proteins { let text = "AI-BLACVAA-AC-KCRLZ$".to_string().into_bytes(); Proteins { input_string: text, - proteins: vec![ + proteins: vec![ Protein { - uniprot_id: String::new(), - taxon_id: 0, - functional_annotations: vec![], + uniprot_id: String::new(), + taxon_id: 0, + functional_annotations: vec![] }, Protein { - uniprot_id: String::new(), - taxon_id: 0, - functional_annotations: vec![], + uniprot_id: String::new(), + taxon_id: 0, + functional_annotations: vec![] }, Protein { - uniprot_id: String::new(), - taxon_id: 0, - functional_annotations: vec![], + uniprot_id: String::new(), + taxon_id: 0, + functional_annotations: vec![] }, Protein { - uniprot_id: String::new(), - taxon_id: 0, - functional_annotations: vec![], + uniprot_id: String::new(), + taxon_id: 0, + functional_annotations: vec![] }, - ], + ] } } #[test] fn test_search_simple() { let proteins = get_example_proteins(); - let sa = vec![ - 19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18, - ]; + let sa = SuffixArray::Original( + vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], + 1 + ); let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); let searcher = Searcher::new( sa, - 1, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, - TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(), + TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::LcaStar + ) + .unwrap(), FunctionAggregator {} ); @@ -650,52 +687,53 @@ mod tests { #[test] fn test_search_sparse() { let proteins = get_example_proteins(); - let sa = vec![9, 0, 3, 12, 15, 6, 18]; + let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3); let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); let searcher = Searcher::new( sa, - 3, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, - TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(), + TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::LcaStar + ) + .unwrap(), FunctionAggregator {} ); // search suffix 'VAA' let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false); - assert_eq!( - found_suffixes, - SearchAllSuffixesResult::SearchResult(vec![7]) - ); + assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![7])); // search suffix 'AC' let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false); - assert_eq!( - found_suffixes, - SearchAllSuffixesResult::SearchResult(vec![5, 11]) - ); + assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![5, 11])); } #[test] fn test_il_equality() { let proteins = get_example_proteins(); - let sa = vec![ - 19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18, - ]; + let sa = SuffixArray::Original( + vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], + 1 + ); let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); let searcher = Searcher::new( sa, - 1, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, - TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(), + TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::LcaStar + ) + .unwrap(), FunctionAggregator {} ); @@ -710,27 +748,27 @@ mod tests { #[test] fn test_il_equality_sparse() { let proteins = get_example_proteins(); - let sa = vec![9, 0, 3, 12, 15, 6, 18]; + let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3); let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); let searcher = Searcher::new( sa, - 3, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, - TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(), + TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::LcaStar + ) + .unwrap(), FunctionAggregator {} ); // search bounds 'RIZ' with equal I and L let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true); - assert_eq!( - found_suffixes, - SearchAllSuffixesResult::SearchResult(vec![16]) - ); + assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![16])); // search bounds 'RIZ' without equal I and L let found_suffixes = @@ -745,32 +783,32 @@ mod tests { let proteins = Proteins { input_string: text, - proteins: vec![Protein { - uniprot_id: String::new(), - taxon_id: 0, - functional_annotations: vec![], - }], + proteins: vec![Protein { + uniprot_id: String::new(), + taxon_id: 0, + functional_annotations: vec![] + }] }; let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = vec![0, 2, 4]; + let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2); let searcher = Searcher::new( sparse_sa, - 2, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, - TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(), + TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::LcaStar + ) + .unwrap(), FunctionAggregator {} ); // search bounds 'IM' with equal I and L let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true); - assert_eq!( - found_suffixes, - SearchAllSuffixesResult::SearchResult(vec![0]) - ); + assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0])); } #[test] @@ -779,31 +817,31 @@ mod tests { let proteins = Proteins { input_string: text, - proteins: vec![Protein { - uniprot_id: String::new(), - taxon_id: 0, - functional_annotations: vec![], - }], + proteins: vec![Protein { + uniprot_id: String::new(), + taxon_id: 0, + functional_annotations: vec![] + }] }; let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = vec![6, 0, 1, 5, 4, 3, 2]; + let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1); let searcher = Searcher::new( sparse_sa, - 1, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, - TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(), + TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::LcaStar + ) + .unwrap(), FunctionAggregator {} ); let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true); - assert_eq!( - found_suffixes, - SearchAllSuffixesResult::SearchResult(vec![2, 3, 4, 5]) - ); + assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![2, 3, 4, 5])); } #[test] @@ -812,31 +850,31 @@ mod tests { let proteins = Proteins { input_string: text, - proteins: vec![Protein { - uniprot_id: String::new(), - taxon_id: 0, - functional_annotations: vec![], - }], + proteins: vec![Protein { + uniprot_id: String::new(), + taxon_id: 0, + functional_annotations: vec![] + }] }; let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = vec![6, 5, 4, 3, 2, 1, 0]; + let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); let searcher = Searcher::new( sparse_sa, - 1, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, - TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(), + TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::LcaStar + ) + .unwrap(), FunctionAggregator {} ); let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true); - assert_eq!( - found_suffixes, - SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4]) - ); + assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4])); } #[test] @@ -845,33 +883,33 @@ mod tests { let proteins = Proteins { input_string: text, - proteins: vec![Protein { - uniprot_id: String::new(), - taxon_id: 0, - functional_annotations: vec![], - }], + proteins: vec![Protein { + uniprot_id: String::new(), + taxon_id: 0, + functional_annotations: vec![] + }] }; let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = vec![6, 4, 2, 0]; + let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2); let searcher = Searcher::new( sparse_sa, - 2, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, - TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(), + TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::LcaStar + ) + .unwrap(), FunctionAggregator {} ); // search all places where II is in the string IIIILL, but with a sparse SA // this way we check if filtering the suffixes works as expected let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, false); - assert_eq!( - found_suffixes, - SearchAllSuffixesResult::SearchResult(vec![0, 1, 2]) - ); + assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2])); } #[test] @@ -880,31 +918,31 @@ mod tests { let proteins = Proteins { input_string: text, - proteins: vec![Protein { - uniprot_id: String::new(), - taxon_id: 0, - functional_annotations: vec![], - }], + proteins: vec![Protein { + uniprot_id: String::new(), + taxon_id: 0, + functional_annotations: vec![] + }] }; let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = vec![6, 5, 4, 3, 2, 1, 0]; + let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); let searcher = Searcher::new( sparse_sa, - 1, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, - TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(), + TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::LcaStar + ) + .unwrap(), FunctionAggregator {} ); // search bounds 'IM' with equal I and L let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true); - assert_eq!( - found_suffixes, - SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4]) - ); + assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4])); } -} \ No newline at end of file +} diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs index 4144f38..0091fed 100644 --- a/sa-index/src/suffix_to_protein_index.rs +++ b/sa-index/src/suffix_to_protein_index.rs @@ -116,6 +116,7 @@ impl SparseSuffixToProtein { #[cfg(test)] mod tests { + use clap::ValueEnum; use sa_mappings::proteins::{ SEPARATION_CHARACTER, TERMINATION_CHARACTER @@ -125,7 +126,8 @@ mod tests { suffix_to_protein_index::{ DenseSuffixToProtein, SparseSuffixToProtein, - SuffixToProteinIndex + SuffixToProteinIndex, + SuffixToProteinMappingStyle }, Nullable }; @@ -136,6 +138,18 @@ mod tests { text.into_bytes() } + #[test] + fn test_suffix_to_protein_mapping_style() { + assert_eq!( + SuffixToProteinMappingStyle::Dense, + SuffixToProteinMappingStyle::from_str("dense", false).unwrap() + ); + assert_eq!( + SuffixToProteinMappingStyle::Sparse, + SuffixToProteinMappingStyle::from_str("sparse", false).unwrap() + ); + } + #[test] fn test_dense_build() { let u8_text = &build_text(); diff --git a/sa-index/src/util.rs b/sa-index/src/util.rs deleted file mode 100644 index 2f3f467..0000000 --- a/sa-index/src/util.rs +++ /dev/null @@ -1,63 +0,0 @@ -use std::{ - fs::File, - io, - io::BufRead, - path::Path, - time::{ - SystemTime, - SystemTimeError, - UNIX_EPOCH - } -}; - -use crate::sa_searcher::Searcher; - -/// Gets the current time in ms -/// -/// # Returns -/// -/// Returns the current time in ms -/// -/// # Errors -/// -/// Returns a SystemTimeError if getting the current time somehow fails -#[allow(unused)] -pub fn get_time_ms() -> Result { - Ok(SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos() as f64 * 1e-6) -} - -/// Times how long the function `f`, that has the searcher as only argument executed -/// -/// # Returns -/// -/// Returns the execution time of `f`in ms -/// -/// # Errors -/// -/// Returns a SystemTimeError if getting the start or end time failed -#[allow(unused)] -pub fn time_execution( - searcher: &mut Searcher, - f: &dyn Fn(&mut Searcher) -> bool -) -> Result<(bool, f64), SystemTimeError> { - let start_ms = get_time_ms()?; - let found = f(searcher); - let end_ms = get_time_ms()?; - Ok((found, end_ms - start_ms)) -} - -/// Opens `filename` and creates an iterator over it per line -/// -/// # Arguments -/// * `filename` - The file we want to iterate over per line -/// -/// # Returns -/// -/// Returns an Iterator to the Reader of the lines of the file. -pub fn read_lines

(filename: P) -> io::Result>> -where - P: AsRef -{ - let file = File::open(filename)?; - Ok(io::BufReader::new(file).lines()) -} diff --git a/sa-mappings/src/functionality.rs b/sa-mappings/src/functionality.rs index 8b60eb6..45c446e 100644 --- a/sa-mappings/src/functionality.rs +++ b/sa-mappings/src/functionality.rs @@ -14,9 +14,9 @@ use crate::proteins::Protein; #[derive(Debug, Serialize)] pub struct FunctionalAggregation { /// A HashMap representing how many GO, EC and IPR terms were found - pub counts: HashMap, + counts: HashMap, /// A HashMap representing how often a certain functional annotation was found - pub data: HashMap + data: HashMap } /// A struct that represents a function aggregator @@ -32,6 +32,9 @@ impl FunctionAggregator { /// /// Returns a JSON string containing the aggregated functional annotations pub fn aggregate(&self, proteins: Vec<&Protein>) -> FunctionalAggregation { + // Keep track of the proteins that have any annotation + let mut proteins_with_annotations: HashSet = HashSet::new(); + // Keep track of the proteins that have a certain annotation let mut proteins_with_ec: HashSet = HashSet::new(); let mut proteins_with_go: HashSet = HashSet::new(); @@ -43,10 +46,19 @@ impl FunctionAggregator { for protein in proteins.iter() { for annotation in protein.get_functional_annotations().split(';') { match annotation.chars().next() { - Some('E') => proteins_with_ec.insert(protein.uniprot_id.clone()), - Some('G') => proteins_with_go.insert(protein.uniprot_id.clone()), - Some('I') => proteins_with_ipr.insert(protein.uniprot_id.clone()), - _ => false + Some('E') => { + proteins_with_ec.insert(protein.uniprot_id.clone()); + proteins_with_annotations.insert(protein.uniprot_id.clone()); + } + Some('G') => { + proteins_with_go.insert(protein.uniprot_id.clone()); + proteins_with_annotations.insert(protein.uniprot_id.clone()); + } + Some('I') => { + proteins_with_ipr.insert(protein.uniprot_id.clone()); + proteins_with_annotations.insert(protein.uniprot_id.clone()); + } + _ => {} }; data.entry(annotation.to_string()) @@ -56,7 +68,7 @@ impl FunctionAggregator { } let mut counts: HashMap = HashMap::new(); - counts.insert("all".to_string(), proteins.len()); + counts.insert("all".to_string(), proteins_with_annotations.len()); counts.insert("EC".to_string(), proteins_with_ec.len()); counts.insert("GO".to_string(), proteins_with_go.len()); counts.insert("IPR".to_string(), proteins_with_ipr.len()); @@ -100,30 +112,51 @@ mod tests { #[test] fn test_aggregate() { let mut proteins: Vec = Vec::new(); - proteins.push(Protein { + + let protein1 = Protein { uniprot_id: "P12345".to_string(), taxon_id: 9606, functional_annotations: encode("GO:0001234;GO:0005678") - }); - proteins.push(Protein { + }; + let protein2 = Protein { uniprot_id: "P23456".to_string(), taxon_id: 9606, functional_annotations: encode("EC:1.1.1.-") - }); + }; + let protein3 = Protein { + uniprot_id: "P23876".to_string(), + taxon_id: 9606, + functional_annotations: encode("IPR:IPR123456;EC:1.1.1.-") + }; + let protein4 = Protein { + uniprot_id: "P23877".to_string(), + taxon_id: 9606, + functional_annotations: encode("2345") + }; + + proteins.push(protein1); + proteins.push(protein2); + proteins.push(protein3); + proteins.push(protein4); let function_aggregator = FunctionAggregator {}; let result = function_aggregator.aggregate(proteins.iter().collect()); - assert_eq!(result.counts.get("all"), Some(&2)); - assert_eq!(result.counts.get("EC"), Some(&1)); + assert_eq!(result.counts.get("all"), Some(&3)); + assert_eq!(result.counts.get("EC"), Some(&2)); assert_eq!(result.counts.get("GO"), Some(&1)); - assert_eq!(result.counts.get("IPR"), Some(&0)); + assert_eq!(result.counts.get("IPR"), Some(&1)); assert_eq!(result.counts.get("NOTHING"), None); - assert_eq!(result.data.get("GO:0001234"), Some(&1)); - assert_eq!(result.data.get("GO:0005678"), Some(&1)); - assert_eq!(result.data.get("EC:1.1.1.-"), Some(&1)); + assert_eq!(result.data, { + let mut map = HashMap::new(); + map.insert("GO:0001234".to_string(), 1); + map.insert("GO:0005678".to_string(), 1); + map.insert("EC:1.1.1.-".to_string(), 2); + map.insert("IPR:IPR123456".to_string(), 1); + map + }); assert_eq!(result.data.get("EC:1.1.2.-"), None); } @@ -141,17 +174,24 @@ mod tests { taxon_id: 9606, functional_annotations: encode("EC:1.1.1.-") }; + let protein3 = Protein { + uniprot_id: "P23876".to_string(), + taxon_id: 9606, + functional_annotations: encode("IPR:IPR123456;EC:1.1.1.-") + }; proteins.push(&protein1); proteins.push(&protein2); + proteins.push(&protein3); let function_aggregator = FunctionAggregator {}; let result = function_aggregator.get_all_functional_annotations(proteins.as_slice()); - assert_eq!(result.len(), 2); + assert_eq!(result.len(), 3); assert_eq!(result[0].len(), 2); assert_eq!(result[1].len(), 1); + assert_eq!(result[2].len(), 2); } #[test] diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs index 900c531..e1d2f49 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -10,7 +10,10 @@ use std::{ }; use bytelines::ByteLines; -use fa_compression::algorithm1::decode; +use fa_compression::algorithm1::{ + decode, + encode +}; use umgap::taxon::TaxonId; use crate::taxonomy::TaxonAggregator; @@ -84,7 +87,7 @@ impl Proteins { let uniprot_id = from_utf8(fields.next().unwrap())?; let taxon_id = from_utf8(fields.next().unwrap())?.parse::()?; let sequence = from_utf8(fields.next().unwrap())?; - let functional_annotations: Vec = fields.next().unwrap().to_vec(); + let functional_annotations: Vec = encode(from_utf8(fields.next().unwrap())?); if !taxon_aggregator.taxon_exists(taxon_id) { continue; @@ -185,26 +188,21 @@ mod tests { let database_file = tmp_dir.path().join("database.tsv"); let mut file = File::create(&database_file).unwrap(); - file.write("P12345\t1\tMLPGLALLLLAAWTARALEV\t".as_bytes()) - .unwrap(); - file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]) - .unwrap(); - file.write("\n".as_bytes()).unwrap(); - file.write("P54321\t2\tPTDGNAGLLAEPQIAMFCGRLNMHMNVQNG\t".as_bytes()) - .unwrap(); - file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]) - .unwrap(); - file.write("\n".as_bytes()).unwrap(); - file.write("P67890\t6\tKWDSDPSGTKTCIDT\t".as_bytes()) - .unwrap(); - file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]) - .unwrap(); - file.write("\n".as_bytes()).unwrap(); - file.write("P13579\t17\tKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH\t".as_bytes()) - .unwrap(); - file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]) + file.write( + "P12345\t1\tMLPGLALLLLAAWTARALEV\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes() + ) + .unwrap(); + file.write( + "P54321\t2\tPTDGNAGLLAEPQIAMFCGRLNMHMNVQNG\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n" + .as_bytes() + ) + .unwrap(); + file.write( + "P67890\t6\tKWDSDPSGTKTCIDT\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes() + ) + .unwrap(); + file.write("P13579\t17\tKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes()) .unwrap(); - file.write("\n".as_bytes()).unwrap(); database_file } diff --git a/sa-mappings/src/taxonomy.rs b/sa-mappings/src/taxonomy.rs index a1aabb7..70c65f6 100644 --- a/sa-mappings/src/taxonomy.rs +++ b/sa-mappings/src/taxonomy.rs @@ -15,6 +15,7 @@ use umgap::{ }, taxon::{ read_taxa_file, + Taxon, TaxonId, TaxonList, TaxonTree @@ -24,11 +25,9 @@ use umgap::{ /// A struct that represents a taxon aggregator. pub struct TaxonAggregator { /// A vector that contains the snapped taxon IDs. - snapping: Vec>, - + snapping: Vec>, /// The aggregator used to aggregate taxon IDs. aggregator: Box, - /// The taxon list. taxon_list: TaxonList } @@ -43,6 +42,33 @@ pub enum AggregationMethod { } impl TaxonAggregator { + /// Creates a new `TaxonAggregator` with the given taxa and aggregation method. + /// + /// # Arguments + /// + /// * `taxa` - A vector of `Taxon` objects representing the taxa. + /// * `method` - An `AggregationMethod` enum specifying the aggregation method to use. + /// + /// # Returns + /// + /// Returns a new `TaxonAggregator` instance. + pub fn new(taxa: Vec, method: AggregationMethod) -> Self { + let taxon_tree = TaxonTree::new(&taxa); + let taxon_list = TaxonList::new(taxa); + let snapping = taxon_tree.snapping(&taxon_list, true); + + let aggregator: Box = match method { + AggregationMethod::Lca => Box::new(MixCalculator::new(taxon_tree, 1.0)), + AggregationMethod::LcaStar => Box::new(LCACalculator::new(taxon_tree)) + }; + + Self { + snapping, + aggregator, + taxon_list + } + } + /// Creates a new `TaxonAggregator` from a taxonomy file and an aggregation method. /// /// # Arguments @@ -62,20 +88,7 @@ impl TaxonAggregator { method: AggregationMethod ) -> Result> { let taxons = read_taxa_file(file)?; - let taxon_tree = TaxonTree::new(&taxons); - let taxon_list = TaxonList::new(taxons); - let snapping = taxon_tree.snapping(&taxon_list, true); - - let aggregator: Box = match method { - AggregationMethod::Lca => Box::new(MixCalculator::new(taxon_tree, 1.0)), - AggregationMethod::LcaStar => Box::new(LCACalculator::new(taxon_tree)) - }; - - Ok(Self { - snapping, - aggregator, - taxon_list - }) + Ok(Self::new(taxons, method)) } /// Checks if a taxon exists in the taxon list. @@ -157,6 +170,7 @@ mod tests { }; use tempdir::TempDir; + use umgap::rank::Rank; use super::*; @@ -183,6 +197,30 @@ mod tests { taxonomy_file } + #[test] + fn test_new() { + TaxonAggregator::new( + vec![ + Taxon::new(1, "root".to_string(), Rank::NoRank, 1, true), + Taxon::new(2, "Bacteria".to_string(), Rank::Superkingdom, 1, true), + Taxon::new(6, "Azorhizobium".to_string(), Rank::Genus, 1, true), + Taxon::new(7, "Azorhizobium caulinodans".to_string(), Rank::Species, 6, true), + Taxon::new(9, "Buchnera aphidicola".to_string(), Rank::Species, 6, true), + Taxon::new(10, "Cellvibrio".to_string(), Rank::Genus, 6, true), + Taxon::new(11, "Cellulomonas gilvus".to_string(), Rank::Species, 10, true), + Taxon::new(13, "Dictyoglomus".to_string(), Rank::Genus, 11, true), + Taxon::new(14, "Dictyoglomus thermophilum".to_string(), Rank::Species, 10, true), + Taxon::new(16, "Methylophilus".to_string(), Rank::Genus, 14, true), + Taxon::new(17, "Methylophilus methylotrophus".to_string(), Rank::Species, 16, true), + Taxon::new(18, "Pelobacter".to_string(), Rank::Genus, 17, true), + Taxon::new(19, "Syntrophotalea carbinolica".to_string(), Rank::Species, 17, true), + Taxon::new(20, "Phenylobacterium".to_string(), Rank::Genus, 19, true), + Taxon::new(21, "Invalid".to_string(), Rank::Species, 19, false), + ], + AggregationMethod::Lca + ); + } + #[test] fn test_try_from_taxonomy_file() { // Create a temporary directory for this test diff --git a/sa-server/Cargo.toml b/sa-server/Cargo.toml index c1920f5..935e36b 100644 --- a/sa-server/Cargo.toml +++ b/sa-server/Cargo.toml @@ -12,4 +12,5 @@ tokio = { version = "1.36.0", features = ["rt", "rt-multi-thread", "macros"] } sa-index = { path = "../sa-index" } clap = { version = "4.5.1", features = ["derive"] } sa-builder = { path = "../sa-builder" } -sa-mappings = { path = "../sa-mappings" } \ No newline at end of file +sa-mappings = { path = "../sa-mappings" } +sa-compression = { path = "../sa-compression" } diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs index 98f2981..ef774b2 100644 --- a/sa-server/src/main.rs +++ b/sa-server/src/main.rs @@ -1,5 +1,10 @@ use std::{ error::Error, + fs::File, + io::{ + BufReader, + Read + }, sync::Arc }; @@ -17,8 +22,9 @@ use axum::{ Router }; use clap::Parser; -use sa_builder::binary::load_suffix_array; +use sa_compression::load_compressed_suffix_array; use sa_index::{ + binary::load_suffix_array, peptide_search::{ analyse_all_peptides, search_all_peptides, @@ -27,7 +33,8 @@ use sa_index::{ SearchResultWithAnalysis }, sa_searcher::Searcher, - suffix_to_protein_index::SparseSuffixToProtein + suffix_to_protein_index::SparseSuffixToProtein, + SuffixArray }; use sa_mappings::{ functionality::FunctionAggregator, @@ -168,23 +175,34 @@ async fn start_server(args: Arguments) -> Result<(), Box> { taxonomy } = args; - eprintln!("Loading suffix array..."); - let (sparseness_factor, sa) = load_suffix_array(&index_file)?; + eprintln!(); + eprintln!("📋 Started loading the suffix array..."); + let sa = load_suffix_array_file(&index_file)?; + eprintln!("✅ Successfully loaded the suffix array!"); + eprintln!("\tAmount of items: {}", sa.len()); + eprintln!("\tAmount of bits per item: {}", sa.bits_per_value()); + eprintln!("\tSample rate: {}", sa.sample_rate()); - eprintln!("Loading taxon file..."); + eprintln!(); + eprintln!("📋 Started loading the taxon file..."); let taxon_id_calculator = TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar)?; + eprintln!("✅ Successfully loaded the taxon file!"); + eprintln!("\tAggregation method: LCA*"); + eprintln!(); + eprintln!("📋 Started creating the function aggregator..."); let function_aggregator = FunctionAggregator {}; + eprintln!("✅ Successfully created the function aggregator!"); - eprintln!("Loading proteins..."); + eprintln!(); + eprintln!("📋 Started loading the proteins..."); let proteins = Proteins::try_from_database_file(&database_file, &taxon_id_calculator)?; let suffix_index_to_protein = Box::new(SparseSuffixToProtein::new(&proteins.input_string)); + eprintln!("✅ Successfully loaded the proteins!"); - eprintln!("Creating searcher..."); let searcher = Arc::new(Searcher::new( sa, - sparseness_factor, suffix_index_to_protein, proteins, taxon_id_calculator, @@ -205,8 +223,31 @@ async fn start_server(args: Arguments) -> Result<(), Box> { .with_state(searcher); let listener = tokio::net::TcpListener::bind("0.0.0.0:3000").await?; - println!("server is ready..."); + + eprintln!(); + eprintln!("🚀 Server is ready..."); axum::serve(listener, app).await?; Ok(()) } + +fn load_suffix_array_file(file: &str) -> Result> { + // Open the suffix array file + let mut sa_file = File::open(file)?; + + // Create a buffer reader for the file + let mut reader = BufReader::new(&mut sa_file); + + // Read the bits per value from the binary file (1 byte) + let mut bits_per_value_buffer = [0_u8; 1]; + reader + .read_exact(&mut bits_per_value_buffer) + .map_err(|_| "Could not read the flags from the binary file")?; + let bits_per_value = bits_per_value_buffer[0]; + + if bits_per_value == 64 { + load_suffix_array(&mut reader) + } else { + load_compressed_suffix_array(&mut reader, bits_per_value as usize) + } +}