From f77a28692daa846b61e22adc6e3a516a2ab4ff76 Mon Sep 17 00:00:00 2001 From: Tibo Vande Moortele <34175340+tibvdm@users.noreply.github.com> Date: Mon, 8 Apr 2024 10:40:00 +0200 Subject: [PATCH] Feature/fa compression 2 (#17) Add second compression option --- codecov.yml | 7 + fa-compression/Cargo.toml | 4 + fa-compression/benches/algorithm1/decode.rs | 31 +++ fa-compression/benches/algorithm1/encode.rs | 28 ++ fa-compression/benches/algorithm1/mod.rs | 8 + fa-compression/benches/algorithm2/decode.rs | 38 +++ fa-compression/benches/algorithm2/encode.rs | 35 +++ fa-compression/benches/algorithm2/mod.rs | 8 + fa-compression/benches/bench_main.rs | 35 +-- fa-compression/benches/util.rs | 28 -- fa-compression/src/{ => algorithm1}/decode.rs | 4 +- fa-compression/src/{ => algorithm1}/encode.rs | 4 +- fa-compression/src/algorithm1/mod.rs | 258 +++++++++++++++++ fa-compression/src/algorithm2/decode.rs | 103 +++++++ fa-compression/src/algorithm2/encode.rs | 97 +++++++ fa-compression/src/algorithm2/mod.rs | 146 ++++++++++ fa-compression/src/lib.rs | 262 +----------------- 17 files changed, 777 insertions(+), 319 deletions(-) create mode 100644 fa-compression/benches/algorithm1/decode.rs create mode 100644 fa-compression/benches/algorithm1/encode.rs create mode 100644 fa-compression/benches/algorithm1/mod.rs create mode 100644 fa-compression/benches/algorithm2/decode.rs create mode 100644 fa-compression/benches/algorithm2/encode.rs create mode 100644 fa-compression/benches/algorithm2/mod.rs rename fa-compression/src/{ => algorithm1}/decode.rs (98%) rename fa-compression/src/{ => algorithm1}/encode.rs (98%) create mode 100644 fa-compression/src/algorithm1/mod.rs create mode 100644 fa-compression/src/algorithm2/decode.rs create mode 100644 fa-compression/src/algorithm2/encode.rs create mode 100644 fa-compression/src/algorithm2/mod.rs diff --git a/codecov.yml b/codecov.yml index f49eb98..ec42d7c 100644 --- a/codecov.yml +++ b/codecov.yml @@ -7,6 +7,13 @@ coverage: target: 90% flags: - fa-compression + patch: + default: + target: 90% + fa-compression: + target: 90% + flags: + - fa-compression flags: fa-compression: diff --git a/fa-compression/Cargo.toml b/fa-compression/Cargo.toml index 29496ae..9bff609 100644 --- a/fa-compression/Cargo.toml +++ b/fa-compression/Cargo.toml @@ -10,5 +10,9 @@ criterion = "0.5.1" rand = "0.8.5" [[bench]] +opt-level = 0 name = "bench_main" harness = false + +[profile.bench] +opt-level=3 diff --git a/fa-compression/benches/algorithm1/decode.rs b/fa-compression/benches/algorithm1/decode.rs new file mode 100644 index 0000000..07bdc41 --- /dev/null +++ b/fa-compression/benches/algorithm1/decode.rs @@ -0,0 +1,31 @@ +use criterion::black_box; +use fa_compression::algorithm1::{ + decode, + encode +}; + +use super::util::generate_annotation; + +/// Generate a random number of encoded annotations. +fn generate_encoded_annotations(count: usize) -> Vec { + let mut random = rand::thread_rng(); + + let mut annotations = String::new(); + for _ in 0 .. count { + annotations.push_str(&generate_annotation(&mut random)); + annotations.push(';'); + } + annotations.pop(); + + encode(annotations.as_str()) +} + +pub fn decode_benchmark(c: &mut criterion::Criterion) { + c.bench_function("decode_algorithm1", |b| { + b.iter_batched( + || generate_encoded_annotations(100), + |annotations| black_box(decode(annotations.as_slice())), + criterion::BatchSize::SmallInput + ) + }); +} diff --git a/fa-compression/benches/algorithm1/encode.rs b/fa-compression/benches/algorithm1/encode.rs new file mode 100644 index 0000000..e134c6a --- /dev/null +++ b/fa-compression/benches/algorithm1/encode.rs @@ -0,0 +1,28 @@ +use criterion::black_box; +use fa_compression::algorithm1::encode; + +use super::util::generate_annotation; + +/// Generate a random number of decoded annotations. +fn generate_decoded_annotations(count: usize) -> String { + let mut random = rand::thread_rng(); + + let mut annotations = String::new(); + for _ in 0 .. count { + annotations.push_str(&generate_annotation(&mut random)); + annotations.push(';'); + } + + annotations.pop(); + annotations +} + +pub fn encode_benchmark(c: &mut criterion::Criterion) { + c.bench_function("encode_algorithm1", |b| { + b.iter_batched( + || generate_decoded_annotations(100), + |annotations| black_box(encode(annotations.as_str())), + criterion::BatchSize::SmallInput + ) + }); +} diff --git a/fa-compression/benches/algorithm1/mod.rs b/fa-compression/benches/algorithm1/mod.rs new file mode 100644 index 0000000..b191ba4 --- /dev/null +++ b/fa-compression/benches/algorithm1/mod.rs @@ -0,0 +1,8 @@ +use criterion::criterion_group; + +use super::util; + +mod decode; +mod encode; + +criterion_group!(benches, encode::encode_benchmark, decode::decode_benchmark); diff --git a/fa-compression/benches/algorithm2/decode.rs b/fa-compression/benches/algorithm2/decode.rs new file mode 100644 index 0000000..a70d6b4 --- /dev/null +++ b/fa-compression/benches/algorithm2/decode.rs @@ -0,0 +1,38 @@ +use criterion::black_box; +use fa_compression::algorithm2::{ + decode, + encode, + CompressionTable +}; + +use super::util::generate_annotation; + +fn generate_encoded_annotations_and_table(count: usize) -> (Vec, CompressionTable) { + let mut random = rand::thread_rng(); + + let mut compression_table1 = CompressionTable::new(); + let mut compression_table2 = CompressionTable::new(); + + let mut annotations = String::new(); + for _ in 0 .. count { + let annotation = generate_annotation(&mut random); + annotations.push_str(&annotation); + annotations.push(';'); + compression_table1.add_entry(annotation.clone()); + compression_table2.add_entry(annotation); + } + + annotations.pop(); + + (encode(annotations.as_str(), compression_table1), compression_table2) +} + +pub fn decode_benchmark(c: &mut criterion::Criterion) { + c.bench_function("decode_algorithm2", |b| { + b.iter_batched( + || generate_encoded_annotations_and_table(100), + |(annotations, ct)| black_box(decode(annotations.as_slice(), ct)), + criterion::BatchSize::SmallInput + ) + }); +} diff --git a/fa-compression/benches/algorithm2/encode.rs b/fa-compression/benches/algorithm2/encode.rs new file mode 100644 index 0000000..e1729f7 --- /dev/null +++ b/fa-compression/benches/algorithm2/encode.rs @@ -0,0 +1,35 @@ +use criterion::black_box; +use fa_compression::algorithm2::{ + encode, + CompressionTable +}; + +use super::util::generate_annotation; + +fn generate_decoded_annotations_and_table(count: usize) -> (String, CompressionTable) { + let mut random = rand::thread_rng(); + + let mut compression_table = CompressionTable::new(); + + let mut annotations = String::new(); + for _ in 0 .. count { + let annotation = generate_annotation(&mut random); + annotations.push_str(&annotation); + annotations.push(';'); + compression_table.add_entry(annotation); + } + + annotations.pop(); + + (annotations, compression_table) +} + +pub fn encode_benchmark(c: &mut criterion::Criterion) { + c.bench_function("encode_algorithm2", |b| { + b.iter_batched( + || generate_decoded_annotations_and_table(100), + |(annotations, ct)| black_box(encode(annotations.as_str(), ct)), + criterion::BatchSize::SmallInput + ) + }); +} diff --git a/fa-compression/benches/algorithm2/mod.rs b/fa-compression/benches/algorithm2/mod.rs new file mode 100644 index 0000000..b191ba4 --- /dev/null +++ b/fa-compression/benches/algorithm2/mod.rs @@ -0,0 +1,8 @@ +use criterion::criterion_group; + +use super::util; + +mod decode; +mod encode; + +criterion_group!(benches, encode::encode_benchmark, decode::decode_benchmark); diff --git a/fa-compression/benches/bench_main.rs b/fa-compression/benches/bench_main.rs index 8e2fcce..d1df7fe 100644 --- a/fa-compression/benches/bench_main.rs +++ b/fa-compression/benches/bench_main.rs @@ -1,34 +1,7 @@ -use criterion::{ - black_box, - criterion_group, - criterion_main -}; -use fa_compression::{ - decode, - encode -}; +use criterion::criterion_main; +mod algorithm1; +mod algorithm2; mod util; -fn encode_benchmark(c: &mut criterion::Criterion) { - c.bench_function("encode", |b| { - b.iter_batched( - || util::generate_decoded_annotations(100), - |annotations| black_box(encode(annotations.as_str())), - criterion::BatchSize::SmallInput - ) - }); -} - -fn decode_benchmark(c: &mut criterion::Criterion) { - c.bench_function("decode", |b| { - b.iter_batched( - || util::generate_encoded_annotations(100), - |annotations| black_box(decode(annotations.as_slice())), - criterion::BatchSize::SmallInput - ) - }); -} - -criterion_group!(benches, encode_benchmark, decode_benchmark); -criterion_main!(benches); +criterion_main!(algorithm1::benches, algorithm2::benches); diff --git a/fa-compression/benches/util.rs b/fa-compression/benches/util.rs index c46c64e..47d9990 100644 --- a/fa-compression/benches/util.rs +++ b/fa-compression/benches/util.rs @@ -1,4 +1,3 @@ -use fa_compression::encode; use rand::{ rngs::ThreadRng, Rng @@ -34,30 +33,3 @@ pub fn generate_annotation(random: &mut ThreadRng) -> String { _ => unreachable!() } } - -/// Generate a random number of decoded annotations. -pub fn generate_decoded_annotations(count: usize) -> String { - let mut random = rand::thread_rng(); - - let mut annotations = String::new(); - for _ in 0 .. count { - annotations.push_str(&generate_annotation(&mut random)); - annotations.push(';'); - } - annotations.pop(); - annotations -} - -/// Generate a random number of encoded annotations. -pub fn generate_encoded_annotations(count: usize) -> Vec { - let mut random = rand::thread_rng(); - - let mut annotations = String::new(); - for _ in 0 .. count { - annotations.push_str(&generate_annotation(&mut random)); - annotations.push(';'); - } - annotations.pop(); - - encode(annotations.as_str()) -} diff --git a/fa-compression/src/decode.rs b/fa-compression/src/algorithm1/decode.rs similarity index 98% rename from fa-compression/src/decode.rs rename to fa-compression/src/algorithm1/decode.rs index 1568102..453f107 100644 --- a/fa-compression/src/decode.rs +++ b/fa-compression/src/algorithm1/decode.rs @@ -1,7 +1,7 @@ //! This module provides a function to decode a byte array into a string representation of //! annotations. -use crate::{ +use super::{ CharacterSet, Decode }; @@ -26,7 +26,7 @@ static PREFIXES: [&str; 3] = ["EC:", "GO:", "IPR:IPR"]; /// # Examples /// /// ``` -/// use fa_compression::decode; +/// use fa_compression::algorithm1::decode; /// /// let input = &[ 44, 44, 44, 189, 17, 26, 56, 173, 18, 116, 117, 225, 67, 116, 110, 17, 153, 39 ]; /// let result = decode(input); diff --git a/fa-compression/src/encode.rs b/fa-compression/src/algorithm1/encode.rs similarity index 98% rename from fa-compression/src/encode.rs rename to fa-compression/src/algorithm1/encode.rs index 9249b8c..8cce9cd 100644 --- a/fa-compression/src/encode.rs +++ b/fa-compression/src/algorithm1/encode.rs @@ -1,6 +1,6 @@ //! This module contains the function to encode the input string into a compressed byte vector. -use crate::{ +use super::{ CharacterSet, Encode }; @@ -23,7 +23,7 @@ use crate::{ /// # Examples /// /// ``` -/// use fa_compression::encode; +/// use fa_compression::algorithm1::encode; /// /// let input = "IPR:IPR016364;EC:1.1.1.-;GO:0009279"; /// let encoded = encode(input); diff --git a/fa-compression/src/algorithm1/mod.rs b/fa-compression/src/algorithm1/mod.rs new file mode 100644 index 0000000..3a1d9b4 --- /dev/null +++ b/fa-compression/src/algorithm1/mod.rs @@ -0,0 +1,258 @@ +//! The `fa-compression` crate provides functions to encode and decode annotations following a +//! specific format + +use std::ops::BitOr; + +mod decode; +mod encode; + +pub use decode::decode; +pub use encode::encode; + +/// Trait for encoding a value into a character set. +trait Encode { + /// Encodes the given value into a character set. + /// + /// # Arguments + /// + /// * `value` - The value to be encoded. + /// + /// # Returns + /// + /// The encoded character set. + fn encode(value: u8) -> CharacterSet; +} + +/// Trait for decoding a value from a character set. +trait Decode { + /// Decodes the given value from a character set into a character. + /// + /// # Arguments + /// + /// * `value` - The value to be decoded. + /// + /// # Returns + /// + /// The decoded character. + fn decode(value: u8) -> char; + + /// Decodes a pair of values from a character set into a pair of characters. + /// + /// # Arguments + /// + /// * `value` - The value to be decoded. + /// + /// # Returns + /// + /// A tuple containing the decoded characters. + fn decode_pair(value: u8) -> (char, char) { + (Self::decode(value >> 4), Self::decode(value & 0b1111)) + } +} + +/// Enum representing the set of characters that can be encoded. +#[repr(u8)] +#[cfg_attr(test, derive(Clone, Copy))] +#[derive(PartialEq, Eq, Debug)] +enum CharacterSet { + /// Empty placeholder character + Empty, + + /// Numeric characters + Zero, + One, + Two, + Three, + Four, + Five, + Six, + Seven, + Eight, + Nine, + + /// Special Enzyme Commission characters + Dash, + Point, + + /// Different annotation type separator + Comma, + + /// Annotation separator + Semicolon +} + +impl Encode for CharacterSet { + /// Encodes the given value into a character set. + /// + /// # Arguments + /// + /// * `value` - The value to be encoded. + /// + /// # Returns + /// + /// The encoded character set. + fn encode(value: u8) -> CharacterSet { + match value { + b'$' => CharacterSet::Empty, + b'0' => CharacterSet::Zero, + b'1' => CharacterSet::One, + b'2' => CharacterSet::Two, + b'3' => CharacterSet::Three, + b'4' => CharacterSet::Four, + b'5' => CharacterSet::Five, + b'6' => CharacterSet::Six, + b'7' => CharacterSet::Seven, + b'8' => CharacterSet::Eight, + b'9' => CharacterSet::Nine, + b'-' => CharacterSet::Dash, + b'.' => CharacterSet::Point, + b',' => CharacterSet::Comma, + b';' => CharacterSet::Semicolon, + _ => panic!("Invalid character") + } + } +} + +impl Decode for CharacterSet { + /// Decodes the given value from a character set into a character. + /// + /// # Arguments + /// + /// * `value` - The value to be decoded. + /// + /// # Returns + /// + /// The decoded character. + fn decode(value: u8) -> char { + match value { + 0 => '$', + 1 => '0', + 2 => '1', + 3 => '2', + 4 => '3', + 5 => '4', + 6 => '5', + 7 => '6', + 8 => '7', + 9 => '8', + 10 => '9', + 11 => '-', + 12 => '.', + 13 => ',', + 14 => ';', + _ => panic!("Invalid character") + } + } +} + +impl BitOr for CharacterSet { + type Output = u8; + + /// Performs a bitwise OR operation between two character sets. + /// + /// # Arguments + /// + /// * `self` - The left-hand side character set. + /// * `rhs` - The right-hand side character set. + /// + /// # Returns + /// + /// The result of the bitwise OR operation. + fn bitor(self, rhs: Self) -> Self::Output { + ((self as u8) << 4) | rhs as u8 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + static CHARACTERS: [u8; 15] = + [b'$', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'-', b'.', b',', b';']; + + static CHARACTER_SETS: [CharacterSet; 15] = [ + CharacterSet::Empty, + CharacterSet::Zero, + CharacterSet::One, + CharacterSet::Two, + CharacterSet::Three, + CharacterSet::Four, + CharacterSet::Five, + CharacterSet::Six, + CharacterSet::Seven, + CharacterSet::Eight, + CharacterSet::Nine, + CharacterSet::Dash, + CharacterSet::Point, + CharacterSet::Comma, + CharacterSet::Semicolon + ]; + + #[test] + fn test_or() { + for i in 0 .. CHARACTERS.len() { + for j in 0 .. CHARACTERS.len() { + assert_eq!(CHARACTER_SETS[i] | CHARACTER_SETS[j], ((i as u8) << 4) | (j as u8)); + } + } + } + + #[test] + fn test_encode() { + for i in 0 .. CHARACTERS.len() { + assert_eq!(CHARACTER_SETS[i], CharacterSet::encode(CHARACTERS[i])); + } + } + + #[test] + fn test_decode() { + for (i, c) in CHARACTERS.iter().enumerate() { + assert_eq!(CharacterSet::decode(i as u8), *c as char); + } + } + + #[test] + fn test_decode_pair() { + for (i1, c1) in CHARACTERS.iter().enumerate() { + for (i2, c2) in CHARACTERS.iter().enumerate() { + let encoded = CharacterSet::encode(*c1) | CharacterSet::encode(*c2); + assert_eq!( + CharacterSet::decode_pair(encoded), + (CharacterSet::decode(i1 as u8), CharacterSet::decode(i2 as u8)) + ); + } + } + } + + #[test] + #[should_panic] + fn test_encode_invalid() { + CharacterSet::encode(b'A'); + } + + #[test] + #[should_panic] + fn test_decode_invalid() { + CharacterSet::decode(15); + } + + #[test] + #[should_panic] + fn test_decode_pair_invalid() { + CharacterSet::decode_pair(0b11111111); + } + + #[test] + fn test_clone() { + let character_set = CharacterSet::Empty; + let character_set_clone = character_set.clone(); + assert_eq!(character_set, character_set_clone); + } + + #[test] + fn test_copy() { + let character_set = CharacterSet::Empty; + let character_set_copy = character_set; + assert_eq!(character_set, character_set_copy); + } +} diff --git a/fa-compression/src/algorithm2/decode.rs b/fa-compression/src/algorithm2/decode.rs new file mode 100644 index 0000000..3835d01 --- /dev/null +++ b/fa-compression/src/algorithm2/decode.rs @@ -0,0 +1,103 @@ +//! This module provides a function to decode a byte array into a string representation of +//! annotations. + +use super::CompressionTable; + +/// Decodes a byte slice using a compression table and returns the corresponding string. +/// +/// # Arguments +/// +/// * `input` - The byte slice to decode. +/// * `compression_table` - The compression table used for decoding. +/// +/// # Returns +/// +/// The decoded string. +/// +/// # Examples +/// +/// ``` +/// use fa_compression::algorithm2::decode; +/// use fa_compression::algorithm2::CompressionTable; +/// +/// let input = &[0, 0, 0, 1, 0, 0]; +/// let mut compression_table = CompressionTable::new(); +/// compression_table.add_entry("IPR:IPR000001".to_string()); +/// compression_table.add_entry("IPR:IPR000002".to_string()); +/// +/// let decoded_string = decode(input, compression_table); +/// assert_eq!(decoded_string, "IPR:IPR000001;IPR:IPR000002"); +/// ``` +pub fn decode(input: &[u8], compression_table: CompressionTable) -> String { + if input.is_empty() { + return String::new(); + } + + let mut result = String::with_capacity(input.len() / 3 * 15); + for bytes in input.chunks_exact(3) { + // Convert the first 3 bytes to a u32 and use it as an index in the compression table + let index = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], 0]) as usize; + result.push_str(&compression_table[index].annotation); + result.push(';'); + } + + // Remove the trailing semicolon + result.pop(); + + result +} + +#[cfg(test)] +mod tests { + use super::*; + + fn create_compresion_table() -> CompressionTable { + let mut table = CompressionTable::new(); + + table.add_entry("IPR:IPR000001".to_string()); + table.add_entry("IPR:IPR000002".to_string()); + table.add_entry("IPR:IPR000003".to_string()); + table.add_entry("IPR:IPR000004".to_string()); + table.add_entry("GO:0000001".to_string()); + table.add_entry("GO:0000002".to_string()); + table.add_entry("GO:0000003".to_string()); + table.add_entry("EC:1.1.1.-".to_string()); + table.add_entry("EC:2.12.3.7".to_string()); + table.add_entry("EC:2.2.-.-".to_string()); + + table + } + + #[test] + fn test_decode_empty() { + let table = create_compresion_table(); + assert_eq!(decode(&[], table), "") + } + + #[test] + fn test_decode_single_ec() { + let table = create_compresion_table(); + assert_eq!(decode(&[8, 0, 0], table), "EC:2.12.3.7"); + } + + #[test] + fn test_decode_single_go() { + let table = create_compresion_table(); + assert_eq!(decode(&[6, 0, 0], table), "GO:0000003"); + } + + #[test] + fn test_decode_single_ipr() { + let table = create_compresion_table(); + assert_eq!(decode(&[0, 0, 0], table), "IPR:IPR000001"); + } + + #[test] + fn test_decode_all() { + let table = create_compresion_table(); + assert_eq!( + decode(&[0, 0, 0, 7, 0, 0, 2, 0, 0, 5, 0, 0], table), + "IPR:IPR000001;EC:1.1.1.-;IPR:IPR000003;GO:0000002" + ) + } +} diff --git a/fa-compression/src/algorithm2/encode.rs b/fa-compression/src/algorithm2/encode.rs new file mode 100644 index 0000000..d52844e --- /dev/null +++ b/fa-compression/src/algorithm2/encode.rs @@ -0,0 +1,97 @@ +//! This module contains the function to encode the input string into a compressed byte vector. + +use super::CompressionTable; + +/// Encodes the input string using the provided compression table. +/// +/// # Arguments +/// +/// * `input` - The input string to encode. +/// * `compression_table` - The compression table used for encoding. +/// +/// # Returns +/// +/// A compressed byte vector representing the encoded annotations. +/// +/// # Examples +/// +/// ``` +/// use fa_compression::algorithm2::encode; +/// use fa_compression::algorithm2::CompressionTable; +/// +/// let mut compression_table = CompressionTable::new(); +/// compression_table.add_entry("IPR:IPR000001".to_string()); +/// compression_table.add_entry("IPR:IPR000002".to_string()); +/// +/// let encoded = encode("IPR:IPR000001;IPR:IPR000002", compression_table); +/// assert_eq!(encoded, vec![0, 0, 0, 1, 0, 0]); +/// ``` +pub fn encode(input: &str, compression_table: CompressionTable) -> Vec { + if input.is_empty() { + return Vec::new(); + } + + let mut encoded: Vec = Vec::with_capacity(input.len() / 3); + for annotation in input.split(';') { + if let Some(index) = compression_table.index_of(annotation) { + encoded.extend_from_slice(&index.to_le_bytes()[0 .. 3]) + } + } + + encoded +} + +#[cfg(test)] +mod tests { + use super::*; + + fn create_compresion_table() -> CompressionTable { + let mut table = CompressionTable::new(); + + table.add_entry("IPR:IPR000001".to_string()); + table.add_entry("IPR:IPR000002".to_string()); + table.add_entry("IPR:IPR000003".to_string()); + table.add_entry("IPR:IPR000004".to_string()); + table.add_entry("GO:0000001".to_string()); + table.add_entry("GO:0000002".to_string()); + table.add_entry("GO:0000003".to_string()); + table.add_entry("EC:1.1.1.-".to_string()); + table.add_entry("EC:2.12.3.7".to_string()); + table.add_entry("EC:2.2.-.-".to_string()); + + table + } + + #[test] + fn test_encode_empty() { + let table = create_compresion_table(); + assert_eq!(encode("", table), vec![]) + } + + #[test] + fn test_encode_single_ec() { + let table = create_compresion_table(); + assert_eq!(encode("EC:2.12.3.7", table), vec![8, 0, 0]) + } + + #[test] + fn test_encode_single_go() { + let table = create_compresion_table(); + assert_eq!(encode("GO:0000003", table), vec![6, 0, 0]) + } + + #[test] + fn test_encode_single_ipr() { + let table = create_compresion_table(); + assert_eq!(encode("IPR:IPR000002", table), vec![1, 0, 0]) + } + + #[test] + fn test_encode_all() { + let table = create_compresion_table(); + assert_eq!( + encode("IPR:IPR000001;EC:1.1.1.-;IPR:IPR000003;GO:0000002", table), + vec![0, 0, 0, 7, 0, 0, 2, 0, 0, 5, 0, 0] + ) + } +} diff --git a/fa-compression/src/algorithm2/mod.rs b/fa-compression/src/algorithm2/mod.rs new file mode 100644 index 0000000..be08fe4 --- /dev/null +++ b/fa-compression/src/algorithm2/mod.rs @@ -0,0 +1,146 @@ +//! The `fa-compression` crate provides functions to encode and decode annotations following a +//! specific format + +mod decode; +mod encode; + +use std::ops::Index; + +pub use decode::decode; +pub use encode::encode; + +/// Represents an entry in the compression table. +#[doc(hidden)] +pub struct CompressionTableEntry { + annotation: String +} + +/// Represents a compression table. +pub struct CompressionTable { + /// List of annotations in the compression table. + entries: Vec +} + +impl CompressionTable { + /// Creates a new compression table. + /// + /// # Returns + /// + /// An empty compression table. + /// + /// # Examples + /// + /// ``` + /// use fa_compression::algorithm2::CompressionTable; + /// + /// let table = CompressionTable::new(); + /// ``` + pub fn new() -> CompressionTable { + CompressionTable { + entries: Vec::new() + } + } + + /// Adds a new entry to the compression table. + /// + /// # Arguments + /// + /// * `annotation` - The annotation to add to the compression table. + /// + /// # Examples + /// + /// ``` + /// use fa_compression::algorithm2::CompressionTable; + /// + /// let mut table = CompressionTable::new(); + /// table.add_entry("IPR:IPR000001".to_string()); + /// table.add_entry("IPR:IPR000002".to_string()); + /// ``` + pub fn add_entry(&mut self, annotation: String) { + self.entries.push(CompressionTableEntry { + annotation + }); + } + + /// Returns the index of the given annotation in the compression table, if it exists. + fn index_of(&self, annotation: &str) -> Option { + self.entries + .iter() + .position(|entry| entry.annotation == annotation) + } +} + +impl Default for CompressionTable { + /// Creates a default compression table. + fn default() -> Self { + Self::new() + } +} + +impl Index for CompressionTable { + type Output = CompressionTableEntry; + + /// Returns a reference to the compression table entry at the given index. + fn index(&self, index: usize) -> &Self::Output { + &self.entries[index] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Creates a compression table with some predefined entries for testing. + fn create_compresion_table() -> CompressionTable { + let mut table = CompressionTable::new(); + + table.add_entry("IPR:IPR000001".to_string()); + table.add_entry("IPR:IPR000002".to_string()); + table.add_entry("GO:0000001".to_string()); + table.add_entry("GO:0000002".to_string()); + table.add_entry("EC:1.1.1.-".to_string()); + + table + } + + #[test] + fn test_default() { + assert_eq!(CompressionTable::default().entries.len(), 0); + } + + #[test] + fn test_add_entry() { + assert_eq!(create_compresion_table().entries.len(), 5); + } + + #[test] + fn test_index_of() { + let table = create_compresion_table(); + + assert_eq!(table.index_of("IPR:IPR000001"), Some(0)); + assert_eq!(table.index_of("IPR:IPR000002"), Some(1)); + assert_eq!(table.index_of("GO:0000001"), Some(2)); + assert_eq!(table.index_of("GO:0000002"), Some(3)); + assert_eq!(table.index_of("EC:1.1.1.-"), Some(4)); + } + + #[test] + fn test_index_of_not_found() { + let table = create_compresion_table(); + + assert_eq!(table.index_of("IPR:IPR000003"), None); + assert_eq!(table.index_of("GO:0000003"), None); + assert_eq!(table.index_of("EC:2.2.2.-"), None); + } + + #[test] + fn test_index() { + let table = create_compresion_table(); + + assert_eq!(table[0].annotation, "IPR:IPR000001"); + assert_eq!(table[1].annotation, "IPR:IPR000002"); + assert_eq!(table[2].annotation, "GO:0000001"); + assert_eq!(table[3].annotation, "GO:0000002"); + assert_eq!(table[4].annotation, "EC:1.1.1.-"); + } +} diff --git a/fa-compression/src/lib.rs b/fa-compression/src/lib.rs index 3a1d9b4..9efba21 100644 --- a/fa-compression/src/lib.rs +++ b/fa-compression/src/lib.rs @@ -1,258 +1,8 @@ -//! The `fa-compression` crate provides functions to encode and decode annotations following a -//! specific format +//! The `fa-compression` crate provides multiple algorithms for compressing and decompressing +//! functional annotations. The algorithms are designed to work with the output generated by the +//! Unipept index builder. -use std::ops::BitOr; +#![warn(missing_docs)] -mod decode; -mod encode; - -pub use decode::decode; -pub use encode::encode; - -/// Trait for encoding a value into a character set. -trait Encode { - /// Encodes the given value into a character set. - /// - /// # Arguments - /// - /// * `value` - The value to be encoded. - /// - /// # Returns - /// - /// The encoded character set. - fn encode(value: u8) -> CharacterSet; -} - -/// Trait for decoding a value from a character set. -trait Decode { - /// Decodes the given value from a character set into a character. - /// - /// # Arguments - /// - /// * `value` - The value to be decoded. - /// - /// # Returns - /// - /// The decoded character. - fn decode(value: u8) -> char; - - /// Decodes a pair of values from a character set into a pair of characters. - /// - /// # Arguments - /// - /// * `value` - The value to be decoded. - /// - /// # Returns - /// - /// A tuple containing the decoded characters. - fn decode_pair(value: u8) -> (char, char) { - (Self::decode(value >> 4), Self::decode(value & 0b1111)) - } -} - -/// Enum representing the set of characters that can be encoded. -#[repr(u8)] -#[cfg_attr(test, derive(Clone, Copy))] -#[derive(PartialEq, Eq, Debug)] -enum CharacterSet { - /// Empty placeholder character - Empty, - - /// Numeric characters - Zero, - One, - Two, - Three, - Four, - Five, - Six, - Seven, - Eight, - Nine, - - /// Special Enzyme Commission characters - Dash, - Point, - - /// Different annotation type separator - Comma, - - /// Annotation separator - Semicolon -} - -impl Encode for CharacterSet { - /// Encodes the given value into a character set. - /// - /// # Arguments - /// - /// * `value` - The value to be encoded. - /// - /// # Returns - /// - /// The encoded character set. - fn encode(value: u8) -> CharacterSet { - match value { - b'$' => CharacterSet::Empty, - b'0' => CharacterSet::Zero, - b'1' => CharacterSet::One, - b'2' => CharacterSet::Two, - b'3' => CharacterSet::Three, - b'4' => CharacterSet::Four, - b'5' => CharacterSet::Five, - b'6' => CharacterSet::Six, - b'7' => CharacterSet::Seven, - b'8' => CharacterSet::Eight, - b'9' => CharacterSet::Nine, - b'-' => CharacterSet::Dash, - b'.' => CharacterSet::Point, - b',' => CharacterSet::Comma, - b';' => CharacterSet::Semicolon, - _ => panic!("Invalid character") - } - } -} - -impl Decode for CharacterSet { - /// Decodes the given value from a character set into a character. - /// - /// # Arguments - /// - /// * `value` - The value to be decoded. - /// - /// # Returns - /// - /// The decoded character. - fn decode(value: u8) -> char { - match value { - 0 => '$', - 1 => '0', - 2 => '1', - 3 => '2', - 4 => '3', - 5 => '4', - 6 => '5', - 7 => '6', - 8 => '7', - 9 => '8', - 10 => '9', - 11 => '-', - 12 => '.', - 13 => ',', - 14 => ';', - _ => panic!("Invalid character") - } - } -} - -impl BitOr for CharacterSet { - type Output = u8; - - /// Performs a bitwise OR operation between two character sets. - /// - /// # Arguments - /// - /// * `self` - The left-hand side character set. - /// * `rhs` - The right-hand side character set. - /// - /// # Returns - /// - /// The result of the bitwise OR operation. - fn bitor(self, rhs: Self) -> Self::Output { - ((self as u8) << 4) | rhs as u8 - } -} - -#[cfg(test)] -mod tests { - use super::*; - - static CHARACTERS: [u8; 15] = - [b'$', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'-', b'.', b',', b';']; - - static CHARACTER_SETS: [CharacterSet; 15] = [ - CharacterSet::Empty, - CharacterSet::Zero, - CharacterSet::One, - CharacterSet::Two, - CharacterSet::Three, - CharacterSet::Four, - CharacterSet::Five, - CharacterSet::Six, - CharacterSet::Seven, - CharacterSet::Eight, - CharacterSet::Nine, - CharacterSet::Dash, - CharacterSet::Point, - CharacterSet::Comma, - CharacterSet::Semicolon - ]; - - #[test] - fn test_or() { - for i in 0 .. CHARACTERS.len() { - for j in 0 .. CHARACTERS.len() { - assert_eq!(CHARACTER_SETS[i] | CHARACTER_SETS[j], ((i as u8) << 4) | (j as u8)); - } - } - } - - #[test] - fn test_encode() { - for i in 0 .. CHARACTERS.len() { - assert_eq!(CHARACTER_SETS[i], CharacterSet::encode(CHARACTERS[i])); - } - } - - #[test] - fn test_decode() { - for (i, c) in CHARACTERS.iter().enumerate() { - assert_eq!(CharacterSet::decode(i as u8), *c as char); - } - } - - #[test] - fn test_decode_pair() { - for (i1, c1) in CHARACTERS.iter().enumerate() { - for (i2, c2) in CHARACTERS.iter().enumerate() { - let encoded = CharacterSet::encode(*c1) | CharacterSet::encode(*c2); - assert_eq!( - CharacterSet::decode_pair(encoded), - (CharacterSet::decode(i1 as u8), CharacterSet::decode(i2 as u8)) - ); - } - } - } - - #[test] - #[should_panic] - fn test_encode_invalid() { - CharacterSet::encode(b'A'); - } - - #[test] - #[should_panic] - fn test_decode_invalid() { - CharacterSet::decode(15); - } - - #[test] - #[should_panic] - fn test_decode_pair_invalid() { - CharacterSet::decode_pair(0b11111111); - } - - #[test] - fn test_clone() { - let character_set = CharacterSet::Empty; - let character_set_clone = character_set.clone(); - assert_eq!(character_set, character_set_clone); - } - - #[test] - fn test_copy() { - let character_set = CharacterSet::Empty; - let character_set_copy = character_set; - assert_eq!(character_set, character_set_copy); - } -} +pub mod algorithm1; +pub mod algorithm2;