From fb6e77a2f87f52aeb0b035c0579768b72200788f Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Wed, 11 Sep 2024 08:50:04 +0200 Subject: [PATCH] Represent chars in protein text with 5 bits, tests don't work yet --- Cargo.lock | 10 + bitarray/src/binary.rs | 8 +- bitarray/src/lib.rs | 18 +- sa-index/Cargo.toml | 1 + sa-index/src/lib.rs | 10 +- sa-index/src/sa_searcher.rs | 95 +++--- sa-index/src/suffix_to_protein_index.rs | 14 +- sa-mappings/Cargo.toml | 2 + sa-mappings/src/proteins.rs | 35 +-- text-compression/Cargo.toml | 9 + text-compression/src/lib.rs | 391 ++++++++++++++++++++++++ 11 files changed, 506 insertions(+), 87 deletions(-) create mode 100644 text-compression/Cargo.toml create mode 100644 text-compression/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index c29abc3..9d81263 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1127,15 +1127,18 @@ dependencies = [ "serde", "serde_json", "tempdir", + "text-compression", ] [[package]] name = "sa-mappings" version = "0.1.0" dependencies = [ + "bitarray", "bytelines", "fa-compression", "tempdir", + "text-compression", ] [[package]] @@ -1275,6 +1278,13 @@ dependencies = [ "remove_dir_all", ] +[[package]] +name = "text-compression" +version = "0.1.0" +dependencies = [ + "bitarray", +] + [[package]] name = "tinytemplate" version = "1.2.1" diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs index e7265cd..a8084d1 100644 --- a/bitarray/src/binary.rs +++ b/bitarray/src/binary.rs @@ -159,10 +159,10 @@ mod tests { #[test] fn test_write_binary() { let mut bitarray = BitArray::with_capacity(4, 40); - bitarray.set(0, 0x1234567890); - bitarray.set(1, 0xabcdef0123); - bitarray.set(2, 0x4567890abc); - bitarray.set(3, 0xdef0123456); + bitarray.set(0, 0x1234567890_u64); + bitarray.set(1, 0xabcdef0123_u64); + bitarray.set(2, 0x4567890abc_u64); + bitarray.set(3, 0xdef0123456_u64); let mut buffer = Vec::new(); bitarray.write_binary(&mut buffer).unwrap(); diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 
655d17e..fe7b532 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -19,7 +19,7 @@ pub struct BitArray { /// The length of the bit array. len: usize, /// The number of bits in a single element of the data vector. - bits_per_value: usize + bits_per_value: usize, } impl BitArray { @@ -39,7 +39,7 @@ impl BitArray { data: vec![0; capacity * bits_per_value / 64 + extra], mask: (1 << bits_per_value) - 1, len: capacity, - bits_per_value + bits_per_value, } } @@ -85,6 +85,7 @@ impl BitArray { /// * `index` - The index of the value to set. /// * `value` - The value to set at the specified index. pub fn set(&mut self, index: usize, value: u64) { + let value: u64 = value.into(); let start_block = index * self.bits_per_value / 64; let start_block_offset = index * self.bits_per_value % 64; @@ -142,6 +143,11 @@ impl BitArray { pub fn clear(&mut self) { self.data.iter_mut().for_each(|x| *x = 0); } + + pub fn get_data_slice(&self, start_slice: usize, end_slice: usize) -> &[u64] { + &self.data[start_slice..end_slice] + } + } /// Writes the data to a writer in a binary format using a bit array. 
This function is helpfull @@ -257,10 +263,10 @@ mod tests { fn test_bitarray_set() { let mut bitarray = BitArray::with_capacity(4, 40); - bitarray.set(0, 0b0001110011111010110001000111111100110010); - bitarray.set(1, 0b1100001001010010011000010100110111001001); - bitarray.set(2, 0b1111001101001101101101101011101001010001); - bitarray.set(3, 0b0000100010010001010001001110101110011100); + bitarray.set(0, 0b0001110011111010110001000111111100110010_u64); + bitarray.set(1, 0b1100001001010010011000010100110111001001_u64); + bitarray.set(2, 0b1111001101001101101101101011101001010001_u64); + bitarray.set(3, 0b0000100010010001010001001110101110011100_u64); assert_eq!(bitarray.data, vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000]); } diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml index de57fc9..25dda76 100644 --- a/sa-index/Cargo.toml +++ b/sa-index/Cargo.toml @@ -14,5 +14,6 @@ clap = { version = "4.4.8", features = ["derive"] } rayon = "1.8.1" serde = { version = "1.0.197", features = ["derive"] } sa-mappings = { path = "../sa-mappings" } +text-compression = { path = "../text-compression" } bitarray = { path = "../bitarray" } serde_json = "1.0.116" diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs index f276906..53f5348 100644 --- a/sa-index/src/lib.rs +++ b/sa-index/src/lib.rs @@ -115,11 +115,11 @@ mod tests { #[test] fn test_suffix_array_compressed() { let mut bitarray = BitArray::with_capacity(5, 40); - bitarray.set(0, 1); - bitarray.set(1, 2); - bitarray.set(2, 3); - bitarray.set(3, 4); - bitarray.set(4, 5); + bitarray.set(0, 1 as u64); + bitarray.set(1, 2 as u64); + bitarray.set(2, 3 as u64); + bitarray.set(3, 4 as u64); + bitarray.set(4, 5 as u64); let sa = SuffixArray::Compressed(bitarray, 1); assert_eq!(sa.len(), 5); diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index d09c704..7f60cbb 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -1,6 +1,7 @@ use std::{cmp::min, ops::Deref}; use 
sa_mappings::proteins::{Protein, Proteins}; +use text_compression::ProteinTextSlice; use crate::{ sa_searcher::BoundSearch::{Maximum, Minimum}, @@ -75,7 +76,7 @@ pub struct SparseSearcher(Searcher); impl SparseSearcher { pub fn new(sa: SuffixArray, proteins: Proteins) -> Self { - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); Self(searcher) } @@ -93,7 +94,7 @@ pub struct DenseSearcher(Searcher); impl DenseSearcher { pub fn new(sa: SuffixArray, proteins: Proteins) -> Self { - let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); Self(searcher) } @@ -176,12 +177,12 @@ impl Searcher { // match as long as possible while index_in_search_string < search_string.len() - && index_in_suffix < self.proteins.input_string.len() - && (search_string[index_in_search_string] == self.proteins.input_string[index_in_suffix] + && index_in_suffix < self.proteins.text.len() + && (search_string[index_in_search_string] == self.proteins.text.get(index_in_suffix) as u8 || (search_string[index_in_search_string] == b'L' - && self.proteins.input_string[index_in_suffix] == b'I') + && self.proteins.text.get(index_in_suffix) as u8 == b'I') || (search_string[index_in_search_string] == b'I' - && self.proteins.input_string[index_in_suffix] == b'L')) + && self.proteins.text.get(index_in_suffix) as u8 == b'L')) { index_in_suffix += 1; index_in_search_string += 1; @@ -191,7 +192,7 @@ impl Searcher { if !search_string.is_empty() { if index_in_search_string == search_string.len() { is_cond_or_equal = true - } else if index_in_suffix < self.proteins.input_string.len() { + } else if index_in_suffix < self.proteins.text.len() { // in our index 
every L was replaced by a I, so we need to replace them if we want // to search in the right direction let peptide_char = if search_string[index_in_search_string] == b'L' { @@ -200,10 +201,10 @@ impl Searcher { search_string[index_in_search_string] }; - let protein_char = if self.proteins.input_string[index_in_suffix] == b'L' { + let protein_char = if self.proteins.text.get(index_in_suffix) as u8 == b'L' { b'I' } else { - self.proteins.input_string[index_in_suffix] + self.proteins.text.get(index_in_suffix) as u8 }; is_cond_or_equal = condition_check(peptide_char, protein_char); @@ -340,16 +341,14 @@ impl Searcher { // check at all if suffix >= skip && ((skip == 0 - || Self::check_prefix( - current_search_string_prefix, - &self.proteins.input_string[suffix - skip..suffix], - equate_il - )) - && Self::check_suffix( + || ProteinTextSlice::new(&self.proteins.text, suffix - skip, suffix) + .equals_slice(current_search_string_prefix, equate_il)) + && + Self::check_suffix( skip, il_locations_current_suffix, current_search_string_suffix, - &self.proteins.input_string[suffix..suffix + search_string.len() - skip], + ProteinTextSlice::new(&self.proteins.text, suffix, suffix + search_string.len() - skip), equate_il )) { @@ -419,19 +418,13 @@ impl Searcher { skip: usize, il_locations: &[usize], search_string: &[u8], - index_string: &[u8], + text_slice: ProteinTextSlice, equate_il: bool ) -> bool { if equate_il { true } else { - for &il_location in il_locations { - let index = il_location - skip; - if search_string[index] != index_string[index] { - return false; - } - } - true + text_slice.check_il_locations(skip, il_locations, search_string) } } @@ -459,6 +452,7 @@ impl Searcher { #[cfg(test)] mod tests { use sa_mappings::proteins::{Protein, Proteins}; + use text_compression::ProteinText; use crate::{ sa_searcher::{BoundSearchResult, SearchAllSuffixesResult, Searcher}, @@ -487,9 +481,11 @@ mod tests { } fn get_example_proteins() -> Proteins { - let text = 
"AI-BLACVAA-AC-KCRLZ$".to_string().into_bytes(); + let input_string = "AI-BLACVAA-AC-KCRLZ$"; + let text = ProteinText::from_string(input_string); + Proteins { - input_string: text, + text, proteins: vec![ Protein { uniprot_id: String::new(), @@ -520,7 +516,7 @@ mod tests { let proteins = get_example_proteins(); let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'A' @@ -541,7 +537,7 @@ mod tests { let proteins = get_example_proteins(); let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search suffix 'VAA' @@ -558,7 +554,7 @@ mod tests { let proteins = get_example_proteins(); let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); let bounds_res = searcher.search_bounds(&[b'I']); @@ -574,7 +570,7 @@ mod tests { let proteins = get_example_proteins(); let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'RIZ' with equal I and L @@ -589,10 +585,11 @@ mod 
tests { // test edge case where an I or L is the first index in the sparse SA. #[test] fn test_l_first_index_in_sa() { - let text = "LMOXZ$".to_string().into_bytes(); + let input_string = "LMOXZ$"; + let text = ProteinText::from_string(input_string); let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, @@ -601,7 +598,7 @@ mod tests { }; let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'IM' with equal I and L @@ -611,10 +608,11 @@ mod tests { #[test] fn test_il_missing_matches() { - let text = "AAILLL$".to_string().into_bytes(); + let input_string = "AAILLL$"; + let text = ProteinText::from_string(input_string); let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, @@ -623,7 +621,7 @@ mod tests { }; let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true); @@ -632,19 +630,20 @@ mod tests { #[test] fn test_il_duplication() { - let text = "IIIILL$".to_string().into_bytes(); + let input_string = "IIIILL$"; + let text = ProteinText::from_string(input_string); let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, functional_annotations: vec![] }] }; - + let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); - let suffix_index_to_protein = 
SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true); @@ -653,10 +652,11 @@ mod tests { #[test] fn test_il_suffix_check() { - let text = "IIIILL$".to_string().into_bytes(); - + let input_string = "IIIILL$"; + let text = ProteinText::from_string(input_string); + let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, @@ -665,7 +665,7 @@ mod tests { }; let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); // search all places where II is in the string IIIILL, but with a sparse SA @@ -676,10 +676,11 @@ mod tests { #[test] fn test_il_duplication2() { - let text = "IILLLL$".to_string().into_bytes(); + let input_string = "IILLLL$"; + let text = ProteinText::from_string(input_string); let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, @@ -688,7 +689,7 @@ mod tests { }; let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'IM' with equal I and L diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs index 121b569..6aed362 100644 --- a/sa-index/src/suffix_to_protein_index.rs +++ b/sa-index/src/suffix_to_protein_index.rs @@ -2,6 +2,7 @@ 
use clap::ValueEnum; use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER}; use crate::Nullable; +use text_compression::ProteinText; /// Enum used to define the commandline arguments and choose which index style is used #[derive(ValueEnum, Clone, Debug, PartialEq)] @@ -66,10 +67,10 @@ impl DenseSuffixToProtein { /// # Returns /// /// Returns a new DenseSuffixToProtein build over the provided text - pub fn new(text: &[u8]) -> Self { + pub fn new(text: &ProteinText) -> Self { let mut current_protein_index: u32 = 0; let mut suffix_index_to_protein: Vec = vec![]; - for &char in text.iter() { + for char in text.iter() { if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER { current_protein_index += 1; suffix_index_to_protein.push(u32::NULL); @@ -92,9 +93,9 @@ impl SparseSuffixToProtein { /// # Returns /// /// Returns a new SparseSuffixToProtein build over the provided text - pub fn new(text: &[u8]) -> Self { + pub fn new(text: &ProteinText) -> Self { let mut suffix_index_to_protein: Vec = vec![0]; - for (index, &char) in text.iter().enumerate() { + for (index, char) in text.iter().enumerate() { if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER { suffix_index_to_protein.push(index as i64 + 1); } @@ -108,6 +109,7 @@ impl SparseSuffixToProtein { mod tests { use clap::ValueEnum; use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER}; + use text_compression::ProteinText; use crate::{ suffix_to_protein_index::{ @@ -116,10 +118,10 @@ mod tests { Nullable }; - fn build_text() -> Vec { + fn build_text() -> ProteinText { let mut text = ["ACG", "CG", "AAA"].join(&format!("{}", SEPARATION_CHARACTER as char)); text.push(TERMINATION_CHARACTER as char); - text.into_bytes() + ProteinText::from_string(&text) } #[test] diff --git a/sa-mappings/Cargo.toml b/sa-mappings/Cargo.toml index b20a2bf..d255f7c 100644 --- a/sa-mappings/Cargo.toml +++ b/sa-mappings/Cargo.toml @@ -11,3 +11,5 @@ tempdir = "0.3.7" [dependencies] 
fa-compression = { path = "../fa-compression" } bytelines = "2.5.0" +bitarray = { path = "../bitarray" } +text-compression = { path = "../text-compression" } diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs index f2b24cc..ca3bdd7 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -5,6 +5,7 @@ use std::{error::Error, fs::File, io::BufReader, ops::Index, str::from_utf8}; use bytelines::ByteLines; use fa_compression::algorithm1::{decode, encode}; +use text_compression::ProteinText; /// The separation character used in the input string pub static SEPARATION_CHARACTER: u8 = b'-'; @@ -28,7 +29,7 @@ pub struct Protein { /// A struct that represents a collection of proteins pub struct Proteins { /// The input string containing all proteins - pub input_string: Vec, + pub text: ProteinText, /// The proteins in the input string pub proteins: Vec @@ -86,12 +87,13 @@ impl Proteins { input_string.pop(); input_string.push(TERMINATION_CHARACTER.into()); - input_string.shrink_to_fit(); proteins.shrink_to_fit(); - Ok(Self { input_string: input_string.into_bytes(), proteins }) + + let text = ProteinText::from_string(&input_string); + Ok(Self { text, proteins }) } - /// Creates a `vec` which represents all the proteins concatenated from the database file + /// Creates a `ProteinText` which represents all the proteins concatenated from the database file /// /// # Arguments /// * `file` - The path to the database file @@ -99,12 +101,12 @@ impl Proteins { /// /// # Returns /// - /// Returns a `Result` containing the `Vec` + /// Returns a `Result` containing the `ProteinText` /// /// # Errors /// /// Returns a `Box` if an error occurred while reading the database file - pub fn try_from_database_file_without_annotations(database_file: &str) -> Result, Box> { + pub fn try_from_database_file_without_annotations(database_file: &str) -> Result> { let mut input_string: String = String::new(); let file = File::open(database_file)?; @@ -123,11 
+125,10 @@ impl Proteins { input_string.push(SEPARATION_CHARACTER.into()); } - input_string.pop(); - input_string.push(TERMINATION_CHARACTER.into()); + let text = ProteinText::from_string(&input_string); + + Ok(text) - input_string.shrink_to_fit(); - Ok(input_string.into_bytes()) } } @@ -181,8 +182,10 @@ mod tests { #[test] fn test_new_proteins() { + let input_string = "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG"; + let text = ProteinText::from_string(&input_string); let proteins = Proteins { - input_string: "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes().to_vec(), + text, proteins: vec![ Protein { uniprot_id: "P12345".to_string(), @@ -197,7 +200,6 @@ mod tests { ] }; - assert_eq!(proteins.input_string, "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes()); assert_eq!(proteins.proteins.len(), 2); assert_eq!(proteins[0].uniprot_id, "P12345"); assert_eq!(proteins[0].taxon_id, 1); @@ -245,12 +247,7 @@ mod tests { let proteins = Proteins::try_from_database_file_without_annotations(database_file.to_str().unwrap()).unwrap(); - let sep_char = SEPARATION_CHARACTER as char; - let end_char = TERMINATION_CHARACTER as char; - let expected = format!( - "MLPGLALLLLAAWTARALEV{}PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG{}KWDSDPSGTKTCIDT{}KEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH{}", - sep_char, sep_char, sep_char, end_char - ); - assert_eq!(proteins, expected.as_bytes()); + let expected = 'L' as u8; + assert_eq!(proteins.get(4), expected); } } diff --git a/text-compression/Cargo.toml b/text-compression/Cargo.toml new file mode 100644 index 0000000..c312a3c --- /dev/null +++ b/text-compression/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "text-compression" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +bitarray = { path = "../bitarray" } diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs new file mode 100644 index 
0000000..60b2463
--- /dev/null
+++ b/text-compression/src/lib.rs
@@ -0,0 +1,463 @@
+use std::{
+    error::Error,
+    io::{BufRead, Write}
+};
+use std::collections::HashMap;
+
+use bitarray::{data_to_writer, Binary, BitArray};
+
+/// Number of bits used to store one character of the protein text.
+const BITS_PER_CHAR: usize = 5;
+
+/// Every byte that can be stored; the position of a byte in this string is its 5-bit code.
+///
+/// The 20 standard amino acids and the separator `-` come first (codes 0 to 20). `B`, `J`, `O`,
+/// `U`, `X`, `Z` and the terminator `$` follow, because texts such as "AI-BLACVAA-AC-KCRLZ$"
+/// contain them and could otherwise not be encoded at all.
+const ALPHABET: &[u8] = b"ACDEFGHIKLMNPQRSTVWY-BJOUXZ$";
+
+/// A protein text in which every character is stored with 5 bits instead of a full byte.
+pub struct ProteinText {
+    /// The packed 5-bit codes, one per character.
+    bit_array: BitArray,
+    /// Maps a character to its 5-bit code.
+    char_to_5bit: HashMap<u8, u8>,
+    /// Maps a 5-bit code back to its character.
+    bit5_to_char: Vec<u8>,
+}
+
+impl ProteinText {
+    /// Builds the lookup table from character to 5-bit code.
+    fn create_char_to_5bit_hashmap() -> HashMap<u8, u8> {
+        ALPHABET.iter().enumerate().map(|(code, &character)| (character, code as u8)).collect()
+    }
+
+    /// Builds the lookup table from 5-bit code to character.
+    fn create_bit5_to_char() -> Vec<u8> {
+        ALPHABET.to_vec()
+    }
+
+    /// Encodes `input_string` byte by byte.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the string contains a byte that is not part of the alphabet.
+    pub fn from_string(input_string: &str) -> ProteinText {
+        // Work on bytes: the capacity is a byte count, so indices must be byte positions too.
+        Self::from_vec(input_string.as_bytes())
+    }
+
+    /// Encodes the characters in `input_vec`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `input_vec` contains a byte that is not part of the alphabet.
+    pub fn from_vec(input_vec: &[u8]) -> ProteinText {
+        let mut text = Self::with_capacity(input_vec.len());
+        for (index, &character) in input_vec.iter().enumerate() {
+            text.set(index, character);
+        }
+        text
+    }
+
+    /// Wraps a `BitArray` that already holds 5-bit codes.
+    pub fn new(bit_array: BitArray) -> ProteinText {
+        let char_to_5bit = ProteinText::create_char_to_5bit_hashmap();
+        let bit5_to_char = ProteinText::create_bit5_to_char();
+        Self { bit_array, char_to_5bit, bit5_to_char }
+    }
+
+    /// Creates a text of `capacity` characters; every position starts out as code 0 (`A`).
+    pub fn with_capacity(capacity: usize) -> Self {
+        Self::new(BitArray::with_capacity(capacity, BITS_PER_CHAR))
+    }
+
+    /// Returns the character stored at `index`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the code stored at `index` does not belong to the alphabet.
+    pub fn get(&self, index: usize) -> u8 {
+        let char_5bit = self.bit_array.get(index) as usize;
+        self.bit5_to_char[char_5bit]
+    }
+
+    /// Stores the character `value` at `index`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `value` is not part of the alphabet.
+    pub fn set(&mut self, index: usize, value: u8) {
+        let char_5bit: u8 = *self.char_to_5bit.get(&value).expect("Input character not in alphabet");
+        self.bit_array.set(index, char_5bit as u64);
+    }
+
+    /// Returns the number of characters in the text.
+    pub fn len(&self) -> usize {
+        self.bit_array.len()
+    }
+
+    /// Returns `true` if the text holds no characters.
+    pub fn is_empty(&self) -> bool {
+        self.bit_array.len() == 0
+    }
+
+    /// Clears the underlying `BitArray`, setting all bits to 0.
+    pub fn clear(&mut self) {
+        self.bit_array.clear()
+    }
+
+    /// Returns an iterator over the decoded characters of the text.
+    pub fn iter(&self) -> ProteinTextIterator<'_> {
+        ProteinTextIterator { protein_text: self, index: 0 }
+    }
+}
+
+/// A view on the characters `start..end` of a `ProteinText`.
+pub struct ProteinTextSlice<'a> {
+    text: &'a ProteinText,
+    start: usize, // included
+    end: usize,   // excluded
+}
+
+impl<'a> ProteinTextSlice<'a> {
+    /// Creates a view on `text`; `start` is included, `end` is excluded.
+    pub fn new(text: &'a ProteinText, start: usize, end: usize) -> Self {
+        Self { text, start, end }
+    }
+
+    /// Returns the character at `index`, counted from the start of the slice.
+    pub fn get(&self, index: usize) -> u8 {
+        self.text.get(self.start + index)
+    }
+
+    /// Returns the number of characters in the slice.
+    pub fn len(&self) -> usize {
+        self.end - self.start
+    }
+
+    /// Returns `true` if the slice holds no characters.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Compares the slice with `other`, treating `I` and `L` as equal when `equate_il` is set.
+    ///
+    /// Only the positions present in both are compared; a length difference is not detected.
+    #[inline]
+    pub fn equals_slice(&self, other: &[u8], equate_il: bool) -> bool {
+        if equate_il {
+            other.iter().zip(self.iter()).all(|(&search_character, text_character)| {
+                search_character == text_character
+                    || (search_character == b'I' && text_character == b'L')
+                    || (search_character == b'L' && text_character == b'I')
+            })
+        } else {
+            other.iter().zip(self.iter()).all(|(&search_character, text_character)| search_character == text_character)
+        }
+    }
+
+    /// Checks that `search_string` and the slice agree on every position in `il_locations`.
+    ///
+    /// Each location is reduced by `skip` before it is used as index in both `search_string` and
+    /// this slice.
+    pub fn check_il_locations(
+        &self,
+        skip: usize,
+        il_locations: &[usize],
+        search_string: &[u8],
+    ) -> bool {
+        il_locations.iter().all(|&il_location| {
+            let index = il_location - skip;
+            search_string[index] == self.get(index)
+        })
+    }
+
+    /// Returns an iterator over the decoded characters of the slice.
+    pub fn iter(&self) -> ProteinTextSliceIterator<'_> {
+        ProteinTextSliceIterator { text_slice: self, index: 0 }
+    }
+}
+
+/// Iterator over the characters of a `ProteinText`.
+pub struct ProteinTextIterator<'a> {
+    protein_text: &'a ProteinText,
+    index: usize,
+}
+
+/// Iterator over the characters of a `ProteinTextSlice`.
+pub struct ProteinTextSliceIterator<'a> {
+    text_slice: &'a ProteinTextSlice<'a>,
+    index: usize,
+}
+
+impl<'a> Iterator for ProteinTextSliceIterator<'a> {
+    type Item = u8;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.index >= self.text_slice.len() {
+            return None;
+        }
+
+        self.index += 1;
+        Some(self.text_slice.get(self.index - 1))
+    }
+}
+
+impl<'a> Iterator for ProteinTextIterator<'a> {
+    type Item = u8;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.index >= self.protein_text.len() {
+            return None;
+        }
+
+        self.index += 1;
+        Some(self.protein_text.get(self.index - 1))
+    }
+}
+
+/// Writes the compressed text to a writer.
+///
+/// The output is one byte holding the number of bits per character, the number of characters as
+/// a little-endian `u64`, and then the bit-packed values.
+///
+/// # Arguments
+///
+/// * `text` - The values to pack. NOTE(review): they are packed as-is with 5 bits each, so they
+///   are presumably expected to be 5-bit codes rather than ASCII characters; confirm with callers.
+/// * `writer` - The writer to which the compressed text will be written.
+///
+/// # Errors
+///
+/// Returns an error if writing to the writer fails.
+pub fn dump_compressed_text(
+    text: Vec<u8>,
+    writer: &mut impl Write
+) -> Result<(), Box<dyn Error>> {
+    // Write the number of bits per character; `write_all` guarantees that the header is either
+    // written completely or reported as an error, where `write` may silently write only a part.
+    writer
+        .write_all(&[BITS_PER_CHAR as u8])
+        .map_err(|_| "Could not write the required bits to the writer")?;
+
+    // Write the size of the text to the writer
+    writer
+        .write_all(&(text.len() as u64).to_le_bytes())
+        .map_err(|_| "Could not write the size of the text to the writer")?;
+
+    // Compress the text and write it to the writer
+    let text_writer: Vec<_> = text.iter().map(|&item| item.into()).collect();
+    data_to_writer(text_writer, BITS_PER_CHAR, 8 * 1024, writer)
+        .map_err(|_| "Could not write the compressed text to the writer")?;
+
+    Ok(())
+}
+
+/// Load the compressed text from a reader.
+///
+/// Reading starts at the size of the text; the leading byte with the number of bits per character
+/// that `dump_compressed_text` writes is not read here.
+///
+/// # Arguments
+///
+/// * `reader` - The reader from which the compressed text will be read.
+///
+/// # Errors
+///
+/// Returns an error if reading from the reader fails.
+pub fn load_compressed_text(
+    reader: &mut impl BufRead
+) -> Result<ProteinText, Box<dyn Error>> {
+    // Read the size of the text from the binary file (8 bytes)
+    let mut size_buffer = [0_u8; 8];
+    reader
+        .read_exact(&mut size_buffer)
+        .map_err(|_| "Could not read the size of the text from the binary file")?;
+    let size = u64::from_le_bytes(size_buffer) as usize;
+
+    // Read the compressed text from the binary file
+    let mut compressed_text = BitArray::with_capacity(size, BITS_PER_CHAR);
+    compressed_text
+        .read_binary(reader)
+        .map_err(|_| "Could not read the compressed text from the binary file")?;
+
+    Ok(ProteinText::new(compressed_text))
+}
+
+#[cfg(test)]
+mod tests {
+    use std::io::Read;
+
+    use super::*;
+
+    pub struct FailingWriter {
+        /// The number of times the write function can be called before it fails.
+        pub valid_write_count: usize
+    }
+
+    impl Write for FailingWriter {
+        fn write(&mut self, _: &[u8]) -> Result<usize, std::io::Error> {
+            if self.valid_write_count == 0 {
+                return Err(std::io::Error::new(std::io::ErrorKind::Other, "Write failed"));
+            }
+
+            self.valid_write_count -= 1;
+            Ok(1)
+        }
+
+        fn flush(&mut self) -> Result<(), std::io::Error> {
+            Ok(())
+        }
+    }
+
+    pub struct FailingReader {
+        /// The number of times the read function can be called before it fails.
+        pub valid_read_count: usize
+    }
+
+    impl Read for FailingReader {
+        fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+            if self.valid_read_count == 0 {
+                return Err(std::io::Error::new(std::io::ErrorKind::Other, "Read failed"));
+            }
+
+            self.valid_read_count -= 1;
+            Ok(buf.len())
+        }
+    }
+
+    impl BufRead for FailingReader {
+        fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
+            Ok(&[])
+        }
+
+        fn consume(&mut self, _: usize) {}
+    }
+
+    #[test]
+    fn test_from_string_round_trip() {
+        let input = "AI-BLACVAA-AC-KCRLZ$";
+        let text = ProteinText::from_string(input);
+
+        assert_eq!(text.len(), input.len());
+        assert!(!text.is_empty());
+        assert_eq!(text.iter().collect::<Vec<u8>>(), input.as_bytes());
+    }
+
+    #[test]
+    #[should_panic(expected = "Input character not in alphabet")]
+    fn test_from_string_unknown_character() {
+        ProteinText::from_string("AC*");
+    }
+
+    #[test]
+    fn test_slice_equals_slice() {
+        let text = ProteinText::from_vec(b"AI-BLACVAA-AC-KCRLZ$");
+        let slice = ProteinTextSlice::new(&text, 3, 7);
+
+        assert_eq!(slice.len(), 4);
+        assert!(slice.equals_slice(b"BLAC", false));
+        assert!(!slice.equals_slice(b"BIAC", false));
+        assert!(slice.equals_slice(b"BIAC", true));
+    }
+
+    #[test]
+    fn test_slice_check_il_locations() {
+        let text = ProteinText::from_vec(b"AI-BLACVAA-AC-KCRLZ$");
+        let slice = ProteinTextSlice::new(&text, 3, 7);
+
+        assert!(slice.check_il_locations(1, &[2], b"BLAC"));
+        assert!(!slice.check_il_locations(1, &[2], b"BIAC"));
+    }
+
+    #[test]
+    fn test_dump_compressed_text() {
+        let text: Vec<u8> = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
+
+        let mut writer = vec![];
+        dump_compressed_text(text, &mut writer).unwrap();
+
+        assert_eq!(writer, vec![
+            // bits per value
+            5, // size of the text
+            10, 0, 0, 0, 0, 0, 0, 0, // compressed text
+            0, 128, 74, 232, 152, 66, 134, 8
+        ]);
+    }
+
+    #[test]
+    #[should_panic(expected = "Could not write the required bits to the writer")]
+    fn test_dump_compressed_text_fail_required_bits() {
+        let mut writer = FailingWriter { valid_write_count: 0 };
+
+        dump_compressed_text(vec![], &mut writer).unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Could not write the size of the text to the writer")]
+    fn test_dump_compressed_text_fail_size() {
+        let mut writer = FailingWriter { valid_write_count: 1 };
+
+        dump_compressed_text(vec![], &mut writer).unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Could not write the compressed text to the writer")]
+    fn test_dump_compressed_text_fail_compressed_text() {
+        // The writer accepts one byte per call: 1 call for the bits + 8 calls for the size.
+        let mut writer = FailingWriter { valid_write_count: 9 };
+
+        dump_compressed_text(vec![1], &mut writer).unwrap();
+    }
+
+    #[test]
+    fn test_load_compressed_text() {
+        let data = vec![
+            // size of the text
+            10, 0, 0, 0, 0, 0, 0, 0, // compressed text
+            0, 128, 74, 232, 152, 66, 134, 8
+        ];
+
+        let mut reader = std::io::BufReader::new(&data[..]);
+        let compressed_text = load_compressed_text(&mut reader).unwrap();
+
+        // The stored codes 1..=10 decode to the 2nd up to the 11th character of the alphabet.
+        for (i, &expected) in b"CDEFGHIKLM".iter().enumerate() {
+            assert_eq!(compressed_text.get(i), expected);
+        }
+    }
+
+    #[test]
+    #[should_panic(expected = "Could not read the size of the text from the binary file")]
+    fn test_load_compressed_text_fail_size() {
+        let mut reader = FailingReader { valid_read_count: 0 };
+
+        load_compressed_text(&mut reader).unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Could not read the compressed text from the binary file")]
+    fn test_load_compressed_text_fail_compressed_text() {
+        let mut reader = FailingReader { valid_read_count: 2 };
+
+        load_compressed_text(&mut reader).unwrap();
+    }
+
+    #[test]
+    fn test_failing_writer() {
+        let mut writer = FailingWriter { valid_write_count: 0 };
+        assert!(writer.flush().is_ok());
+        assert!(writer.write(&[0]).is_err());
+    }
+
+    #[test]
+    fn test_failing_reader() {
+        let mut reader = FailingReader { valid_read_count: 0 };
+        let right_buffer: [u8; 0] = [];
+        assert_eq!(reader.fill_buf().unwrap(), &right_buffer);
+        assert_eq!(reader.consume(0), ());
+        let mut buffer = [0_u8; 1];
+        assert!(reader.read(&mut buffer).is_err());
+    }
+}