diff --git a/Cargo.lock b/Cargo.lock index c29abc3..9d81263 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1127,15 +1127,18 @@ dependencies = [ "serde", "serde_json", "tempdir", + "text-compression", ] [[package]] name = "sa-mappings" version = "0.1.0" dependencies = [ + "bitarray", "bytelines", "fa-compression", "tempdir", + "text-compression", ] [[package]] @@ -1275,6 +1278,13 @@ dependencies = [ "remove_dir_all", ] +[[package]] +name = "text-compression" +version = "0.1.0" +dependencies = [ + "bitarray", +] + [[package]] name = "tinytemplate" version = "1.2.1" diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs index e7265cd..a8084d1 100644 --- a/bitarray/src/binary.rs +++ b/bitarray/src/binary.rs @@ -159,10 +159,10 @@ mod tests { #[test] fn test_write_binary() { let mut bitarray = BitArray::with_capacity(4, 40); - bitarray.set(0, 0x1234567890); - bitarray.set(1, 0xabcdef0123); - bitarray.set(2, 0x4567890abc); - bitarray.set(3, 0xdef0123456); + bitarray.set(0, 0x1234567890_u64); + bitarray.set(1, 0xabcdef0123_u64); + bitarray.set(2, 0x4567890abc_u64); + bitarray.set(3, 0xdef0123456_u64); let mut buffer = Vec::new(); bitarray.write_binary(&mut buffer).unwrap(); diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 655d17e..901b395 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -85,6 +85,7 @@ impl BitArray { /// * `index` - The index of the value to set. /// * `value` - The value to set at the specified index. pub fn set(&mut self, index: usize, value: u64) { + let value: u64 = value; let start_block = index * self.bits_per_value / 64; let start_block_offset = index * self.bits_per_value % 64; @@ -142,11 +143,14 @@ impl BitArray { pub fn clear(&mut self) { self.data.iter_mut().for_each(|x| *x = 0); } + + pub fn get_data_slice(&self, start_slice: usize, end_slice: usize) -> &[u64] { + &self.data[start_slice..end_slice] + } } -/// Writes the data to a writer in a binary format using a bit array. 
This function is helpfull -/// when writing large amounts of data to a writer in chunks. The data is written in chunks of the -/// specified capacity, so memory usage is minimized. +/// Writes the data to a writer in a binary format using a bit array. The data is written +/// in chunks of the specified capacity, so memory usage is minimized. /// /// # Arguments /// @@ -257,10 +261,10 @@ mod tests { fn test_bitarray_set() { let mut bitarray = BitArray::with_capacity(4, 40); - bitarray.set(0, 0b0001110011111010110001000111111100110010); - bitarray.set(1, 0b1100001001010010011000010100110111001001); - bitarray.set(2, 0b1111001101001101101101101011101001010001); - bitarray.set(3, 0b0000100010010001010001001110101110011100); + bitarray.set(0, 0b0001110011111010110001000111111100110010_u64); + bitarray.set(1, 0b1100001001010010011000010100110111001001_u64); + bitarray.set(2, 0b1111001101001101101101101011101001010001_u64); + bitarray.set(3, 0b0000100010010001010001001110101110011100_u64); assert_eq!(bitarray.data, vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000]); } diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index 98a1414..01cc3c4 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -21,7 +21,7 @@ fn main() { eprintln!(); eprintln!("📋 Started loading the proteins..."); let start_proteins_time = get_time_ms().unwrap(); - let mut data = Proteins::try_from_database_file_without_annotations(&database_file) + let mut data = Proteins::try_from_database_file_uncompressed(&database_file) .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); eprintln!( "✅ Successfully loaded the proteins in {} seconds!", diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml index de57fc9..25dda76 100644 --- a/sa-index/Cargo.toml +++ b/sa-index/Cargo.toml @@ -14,5 +14,6 @@ clap = { version = "4.4.8", features = ["derive"] } rayon = "1.8.1" serde = { version = "1.0.197", features = ["derive"] } sa-mappings = { path = "../sa-mappings" } 
+text-compression = { path = "../text-compression" } bitarray = { path = "../bitarray" } serde_json = "1.0.116" diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs index f276906..53f5348 100644 --- a/sa-index/src/lib.rs +++ b/sa-index/src/lib.rs @@ -115,11 +115,11 @@ mod tests { #[test] fn test_suffix_array_compressed() { let mut bitarray = BitArray::with_capacity(5, 40); - bitarray.set(0, 1); - bitarray.set(1, 2); - bitarray.set(2, 3); - bitarray.set(3, 4); - bitarray.set(4, 5); + bitarray.set(0, 1 as u64); + bitarray.set(1, 2 as u64); + bitarray.set(2, 3 as u64); + bitarray.set(3, 4 as u64); + bitarray.set(4, 5 as u64); let sa = SuffixArray::Compressed(bitarray, 1); assert_eq!(sa.len(), 5); diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index d09c704..e9590c8 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -1,6 +1,7 @@ use std::{cmp::min, ops::Deref}; use sa_mappings::proteins::{Protein, Proteins}; +use text_compression::ProteinTextSlice; use crate::{ sa_searcher::BoundSearch::{Maximum, Minimum}, @@ -75,7 +76,7 @@ pub struct SparseSearcher(Searcher); impl SparseSearcher { pub fn new(sa: SuffixArray, proteins: Proteins) -> Self { - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); Self(searcher) } @@ -93,7 +94,7 @@ pub struct DenseSearcher(Searcher); impl DenseSearcher { pub fn new(sa: SuffixArray, proteins: Proteins) -> Self { - let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); Self(searcher) } @@ -176,12 +177,10 @@ impl Searcher { // match as long as possible while index_in_search_string < search_string.len() - && index_in_suffix < 
self.proteins.input_string.len() - && (search_string[index_in_search_string] == self.proteins.input_string[index_in_suffix] - || (search_string[index_in_search_string] == b'L' - && self.proteins.input_string[index_in_suffix] == b'I') - || (search_string[index_in_search_string] == b'I' - && self.proteins.input_string[index_in_suffix] == b'L')) + && index_in_suffix < self.proteins.text.len() + && (search_string[index_in_search_string] == self.proteins.text.get(index_in_suffix) + || (search_string[index_in_search_string] == b'L' && self.proteins.text.get(index_in_suffix) == b'I') + || (search_string[index_in_search_string] == b'I' && self.proteins.text.get(index_in_suffix) == b'L')) { index_in_suffix += 1; index_in_search_string += 1; @@ -191,7 +190,7 @@ impl Searcher { if !search_string.is_empty() { if index_in_search_string == search_string.len() { is_cond_or_equal = true - } else if index_in_suffix < self.proteins.input_string.len() { + } else if index_in_suffix < self.proteins.text.len() { // in our index every L was replaced by a I, so we need to replace them if we want // to search in the right direction let peptide_char = if search_string[index_in_search_string] == b'L' { @@ -200,10 +199,10 @@ impl Searcher { search_string[index_in_search_string] }; - let protein_char = if self.proteins.input_string[index_in_suffix] == b'L' { + let protein_char = if self.proteins.text.get(index_in_suffix) == b'L' { b'I' } else { - self.proteins.input_string[index_in_suffix] + self.proteins.text.get(index_in_suffix) }; is_cond_or_equal = condition_check(peptide_char, protein_char); @@ -340,16 +339,14 @@ impl Searcher { // check at all if suffix >= skip && ((skip == 0 - || Self::check_prefix( - current_search_string_prefix, - &self.proteins.input_string[suffix - skip..suffix], - equate_il - )) - && Self::check_suffix( + || ProteinTextSlice::new(&self.proteins.text, suffix - skip, suffix) + .equals_slice(current_search_string_prefix, equate_il)) // Check the prefix + && + 
Self::check_suffix( skip, il_locations_current_suffix, current_search_string_suffix, - &self.proteins.input_string[suffix..suffix + search_string.len() - skip], + ProteinTextSlice::new(&self.proteins.text, suffix, suffix + search_string.len() - skip), equate_il )) { @@ -373,31 +370,6 @@ impl Searcher { } } - /// Returns true of the prefixes are the same - /// if `equate_il` is set to true, L and I are considered the same - /// - /// # Arguments - /// * `search_string_prefix` - The unchecked prefix of the string/peptide that is searched - /// * `index_prefix` - The unchecked prefix from the protein from the suffix array - /// * `equate_il` - True if we want to equate I and L during search, otherwise false - /// - /// # Returns - /// - /// Returns true if `search_string_prefix` and `index_prefix` are considered the same, otherwise - /// false - #[inline] - fn check_prefix(search_string_prefix: &[u8], index_prefix: &[u8], equate_il: bool) -> bool { - if equate_il { - search_string_prefix.iter().zip(index_prefix).all(|(&search_character, &index_character)| { - search_character == index_character - || (search_character == b'I' && index_character == b'L') - || (search_character == b'L' && index_character == b'I') - }) - } else { - search_string_prefix == index_prefix - } - } - /// Returns true of the search_string and index_string are equal /// This is automatically true if `equate_il` is set to true, since there matched during /// search where I = L If `equate_il` is set to false, we need to check if the I and @@ -419,20 +391,10 @@ impl Searcher { skip: usize, il_locations: &[usize], search_string: &[u8], - index_string: &[u8], + text_slice: ProteinTextSlice, equate_il: bool ) -> bool { - if equate_il { - true - } else { - for &il_location in il_locations { - let index = il_location - skip; - if search_string[index] != index_string[index] { - return false; - } - } - true - } + if equate_il { true } else { text_slice.check_il_locations(skip, il_locations, search_string) 
} } /// Returns all the proteins that correspond with the provided suffixes @@ -459,6 +421,7 @@ impl Searcher { #[cfg(test)] mod tests { use sa_mappings::proteins::{Protein, Proteins}; + use text_compression::ProteinText; use crate::{ sa_searcher::{BoundSearchResult, SearchAllSuffixesResult, Searcher}, @@ -487,9 +450,11 @@ mod tests { } fn get_example_proteins() -> Proteins { - let text = "AI-BLACVAA-AC-KCRLZ$".to_string().into_bytes(); + let input_string = "AI-CLACVAA-AC-KCRLY$"; + let text = ProteinText::from_string(input_string); + Proteins { - input_string: text, + text, proteins: vec![ Protein { uniprot_id: String::new(), @@ -520,7 +485,7 @@ mod tests { let proteins = get_example_proteins(); let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'A' @@ -541,7 +506,7 @@ mod tests { let proteins = get_example_proteins(); let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search suffix 'VAA' @@ -558,14 +523,14 @@ mod tests { let proteins = get_example_proteins(); let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); let bounds_res = searcher.search_bounds(&[b'I']); assert_eq!(bounds_res, 
BoundSearchResult::SearchResult((13, 16))); // search bounds 'RIZ' with equal I and L - let bounds_res = searcher.search_bounds(&[b'R', b'I', b'Z']); + let bounds_res = searcher.search_bounds(&[b'R', b'I', b'Y']); assert_eq!(bounds_res, BoundSearchResult::SearchResult((17, 18))); } @@ -574,25 +539,26 @@ mod tests { let proteins = get_example_proteins(); let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'RIZ' with equal I and L - let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true); + let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Y'], usize::MAX, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![16])); // search bounds 'RIZ' without equal I and L - let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false); + let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Y'], usize::MAX, false); assert_eq!(found_suffixes, SearchAllSuffixesResult::NoMatches); } // test edge case where an I or L is the first index in the sparse SA. 
#[test] fn test_l_first_index_in_sa() { - let text = "LMOXZ$".to_string().into_bytes(); + let input_string = "LMPYY$"; + let text = ProteinText::from_string(input_string); let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, @@ -601,7 +567,7 @@ mod tests { }; let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'IM' with equal I and L @@ -611,10 +577,11 @@ mod tests { #[test] fn test_il_missing_matches() { - let text = "AAILLL$".to_string().into_bytes(); + let input_string = "AAILLL$"; + let text = ProteinText::from_string(input_string); let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, @@ -623,7 +590,7 @@ mod tests { }; let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true); @@ -632,10 +599,11 @@ mod tests { #[test] fn test_il_duplication() { - let text = "IIIILL$".to_string().into_bytes(); + let input_string = "IIIILL$"; + let text = ProteinText::from_string(input_string); let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, @@ -644,7 +612,7 @@ mod tests { }; let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = 
SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true); @@ -653,10 +621,11 @@ mod tests { #[test] fn test_il_suffix_check() { - let text = "IIIILL$".to_string().into_bytes(); + let input_string = "IIIILL$"; + let text = ProteinText::from_string(input_string); let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, @@ -665,7 +634,7 @@ mod tests { }; let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); // search all places where II is in the string IIIILL, but with a sparse SA @@ -676,10 +645,11 @@ mod tests { #[test] fn test_il_duplication2() { - let text = "IILLLL$".to_string().into_bytes(); + let input_string = "IILLLL$"; + let text = ProteinText::from_string(input_string); let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, @@ -688,7 +658,7 @@ mod tests { }; let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'IM' with equal I and L diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs index 121b569..a6a4e93 100644 --- a/sa-index/src/suffix_to_protein_index.rs +++ b/sa-index/src/suffix_to_protein_index.rs @@ -1,5 +1,6 @@ use clap::ValueEnum; use sa_mappings::proteins::{SEPARATION_CHARACTER, 
TERMINATION_CHARACTER}; +use text_compression::ProteinText; use crate::Nullable; @@ -66,10 +67,10 @@ impl DenseSuffixToProtein { /// # Returns /// /// Returns a new DenseSuffixToProtein build over the provided text - pub fn new(text: &[u8]) -> Self { + pub fn new(text: &ProteinText) -> Self { let mut current_protein_index: u32 = 0; let mut suffix_index_to_protein: Vec = vec![]; - for &char in text.iter() { + for char in text.iter() { if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER { current_protein_index += 1; suffix_index_to_protein.push(u32::NULL); @@ -92,9 +93,9 @@ impl SparseSuffixToProtein { /// # Returns /// /// Returns a new SparseSuffixToProtein build over the provided text - pub fn new(text: &[u8]) -> Self { + pub fn new(text: &ProteinText) -> Self { let mut suffix_index_to_protein: Vec = vec![0]; - for (index, &char) in text.iter().enumerate() { + for (index, char) in text.iter().enumerate() { if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER { suffix_index_to_protein.push(index as i64 + 1); } @@ -108,6 +109,7 @@ impl SparseSuffixToProtein { mod tests { use clap::ValueEnum; use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER}; + use text_compression::ProteinText; use crate::{ suffix_to_protein_index::{ @@ -116,10 +118,10 @@ mod tests { Nullable }; - fn build_text() -> Vec { + fn build_text() -> ProteinText { let mut text = ["ACG", "CG", "AAA"].join(&format!("{}", SEPARATION_CHARACTER as char)); text.push(TERMINATION_CHARACTER as char); - text.into_bytes() + ProteinText::from_string(&text) } #[test] diff --git a/sa-mappings/Cargo.toml b/sa-mappings/Cargo.toml index b20a2bf..d255f7c 100644 --- a/sa-mappings/Cargo.toml +++ b/sa-mappings/Cargo.toml @@ -11,3 +11,5 @@ tempdir = "0.3.7" [dependencies] fa-compression = { path = "../fa-compression" } bytelines = "2.5.0" +bitarray = { path = "../bitarray" } +text-compression = { path = "../text-compression" } diff --git a/sa-mappings/src/proteins.rs 
b/sa-mappings/src/proteins.rs index f2b24cc..53e52b8 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -5,6 +5,7 @@ use std::{error::Error, fs::File, io::BufReader, ops::Index, str::from_utf8}; use bytelines::ByteLines; use fa_compression::algorithm1::{decode, encode}; +use text_compression::ProteinText; /// The separation character used in the input string pub static SEPARATION_CHARACTER: u8 = b'-'; @@ -28,7 +29,7 @@ pub struct Protein { /// A struct that represents a collection of proteins pub struct Proteins { /// The input string containing all proteins - pub input_string: Vec, + pub text: ProteinText, /// The proteins in the input string pub proteins: Vec @@ -46,7 +47,6 @@ impl Proteins { /// /// # Arguments /// * `file` - The path to the database file - /// * `taxon_aggregator` - The `TaxonAggregator` to use /// /// # Returns /// @@ -86,16 +86,52 @@ impl Proteins { input_string.pop(); input_string.push(TERMINATION_CHARACTER.into()); - input_string.shrink_to_fit(); proteins.shrink_to_fit(); - Ok(Self { input_string: input_string.into_bytes(), proteins }) + + let text = ProteinText::from_string(&input_string); + Ok(Self { text, proteins }) + } + + /// Creates a `ProteinText` which represents all the proteins concatenated from the database file + /// + /// # Arguments + /// * `file` - The path to the database file + /// + /// # Returns + /// + /// Returns a `Result` containing the `ProteinText` + /// + /// # Errors + /// + /// Returns a `Box` if an error occurred while reading the database file + pub fn try_from_database_file_without_annotations(database_file: &str) -> Result> { + let mut input_string: String = String::new(); + + let file = File::open(database_file)?; + + // Read the lines as bytes, since the input string is not guaranteed to be utf8 + // because of the encoded functional annotations + let mut lines = ByteLines::new(BufReader::new(file)); + + while let Some(Ok(line)) = lines.next() { + let mut fields = line.split(|b| 
*b == b'\t'); + + // only get the taxon id and sequence from each line, we don't need the other parts + let sequence = from_utf8(fields.nth(2).unwrap())?; + + input_string.push_str(&sequence.to_uppercase()); + input_string.push(SEPARATION_CHARACTER.into()); + } + + let text = ProteinText::from_string(&input_string); + + Ok(text) } /// Creates a `vec` which represents all the proteins concatenated from the database file /// /// # Arguments /// * `file` - The path to the database file - /// * `taxon_aggregator` - The `TaxonAggregator` to use /// /// # Returns /// @@ -104,7 +140,7 @@ impl Proteins { /// # Errors /// /// Returns a `Box` if an error occurred while reading the database file - pub fn try_from_database_file_without_annotations(database_file: &str) -> Result, Box> { + pub fn try_from_database_file_uncompressed(database_file: &str) -> Result, Box> { let mut input_string: String = String::new(); let file = File::open(database_file)?; @@ -181,8 +217,10 @@ mod tests { #[test] fn test_new_proteins() { + let input_string = "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG"; + let text = ProteinText::from_string(&input_string); let proteins = Proteins { - input_string: "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes().to_vec(), + text, proteins: vec![ Protein { uniprot_id: "P12345".to_string(), @@ -197,7 +235,6 @@ mod tests { ] }; - assert_eq!(proteins.input_string, "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes()); assert_eq!(proteins.proteins.len(), 2); assert_eq!(proteins[0].uniprot_id, "P12345"); assert_eq!(proteins[0].taxon_id, 1); @@ -245,12 +282,7 @@ mod tests { let proteins = Proteins::try_from_database_file_without_annotations(database_file.to_str().unwrap()).unwrap(); - let sep_char = SEPARATION_CHARACTER as char; - let end_char = TERMINATION_CHARACTER as char; - let expected = format!( - "MLPGLALLLLAAWTARALEV{}PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG{}KWDSDPSGTKTCIDT{}KEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH{}", - sep_char, 
sep_char, sep_char, end_char - ); - assert_eq!(proteins, expected.as_bytes()); + let expected = 'L' as u8; + assert_eq!(proteins.get(4), expected); } } diff --git a/text-compression/Cargo.toml b/text-compression/Cargo.toml new file mode 100644 index 0000000..c312a3c --- /dev/null +++ b/text-compression/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "text-compression" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +bitarray = { path = "../bitarray" } diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs new file mode 100644 index 0000000..4866a6c --- /dev/null +++ b/text-compression/src/lib.rs @@ -0,0 +1,632 @@ +use std::{ + collections::HashMap, + error::Error, + io::{BufRead, Write} +}; + +use bitarray::{data_to_writer, Binary, BitArray}; + +/// Structure representing the proteins, stored in a bit array using 5 bits per amino acid. +pub struct ProteinText { + /// Bit array holding the sequence of amino acids + bit_array: BitArray, + /// Hashmap storing the mapping between the character as `u8` and a 5 bit number. + char_to_5bit: HashMap, + /// Vector storing the mapping between the 5 bit number and the character as `u8`. + bit5_to_char: Vec +} + +impl ProteinText { + /// Creates the hashmap storing the mappings between the characters as `u8` and 5 bit numbers. + /// + /// # Returns + /// + /// Returns the hashmap + fn create_char_to_5bit_hashmap() -> HashMap { + let mut hashmap = HashMap::::new(); + for (i, c) in "ACDEFGHIKLMNPQRSTVWY-$".chars().enumerate() { + hashmap.insert(c as u8, i as u8); + } + + hashmap + } + + /// Creates the vector storing the mappings between the 5 bit numbers and the characters as `u8`. 
+ /// + /// # Returns + /// + /// Returns the vector + fn create_bit5_to_char() -> Vec { + let mut vec = Vec::::new(); + for c in "ACDEFGHIKLMNPQRSTVWY-$".chars() { + vec.push(c as u8); + } + vec + } + + /// Creates the compressed text from a string. + /// + /// # Arguments + /// * `input_string` - The text (proteins) in string format + /// + /// # Returns + /// + /// An instance of `ProteinText` + pub fn from_string(input_string: &str) -> ProteinText { + let char_to_5bit = ProteinText::create_char_to_5bit_hashmap(); + let bit5_to_char = ProteinText::create_bit5_to_char(); + + let mut bit_array = BitArray::with_capacity(input_string.len(), 5); + for (i, c) in input_string.chars().enumerate() { + let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect("Input character not in alphabet"); + bit_array.set(i, char_5bit as u64); + } + + Self { bit_array, char_to_5bit, bit5_to_char } + } + + /// Creates the compressed text from a vector. + /// + /// # Arguments + /// * `input_vec` - The text (proteins) in a vector with elements of type `u8` representing the amino acids. + /// + /// # Returns + /// + /// An instance of `ProteinText` + pub fn from_vec(input_vec: &[u8]) -> ProteinText { + let char_to_5bit = ProteinText::create_char_to_5bit_hashmap(); + let bit5_to_char = ProteinText::create_bit5_to_char(); + + let mut bit_array = BitArray::with_capacity(input_vec.len(), 5); + for (i, e) in input_vec.iter().enumerate() { + let char_5bit: u8 = *char_to_5bit.get(e).expect("Input character not in alphabet"); + bit_array.set(i, char_5bit as u64); + } + + Self { bit_array, char_to_5bit, bit5_to_char } + } + + /// Creates the compressed text from a bit array. + /// + /// # Arguments + /// * `bit_array` - The text (proteins) in a bit array using 5 bits for each amino acid. 
+ /// + /// # Returns + /// + /// An instance of `ProteinText` + pub fn new(bit_array: BitArray) -> ProteinText { + let char_to_5bit = ProteinText::create_char_to_5bit_hashmap(); + let bit5_to_char = ProteinText::create_bit5_to_char(); + Self { bit_array, char_to_5bit, bit5_to_char } + } + + /// Creates an instance of `ProteinText` with a given capacity. + /// + /// # Arguments + /// * `capacity` - The amount of characters in the text. + /// + /// # Returns + /// + /// An instance of `ProteinText` + pub fn with_capacity(capacity: usize) -> Self { + Self::new(BitArray::with_capacity(capacity, 5)) + } + + /// Gets the character at a given position in the compressed text. + /// + /// # Arguments + /// * `index` - The index of the character to get. + /// + /// # Returns + /// + /// The character at position `index` as `u8`. + pub fn get(&self, index: usize) -> u8 { + let char_5bit = self.bit_array.get(index) as usize; + self.bit5_to_char[char_5bit] + } + + /// Set the character at a given index. + /// + /// # Arguments + /// * `index` - The index of the character to change. + /// * `value` - The character to fill in as `u8`. + pub fn set(&mut self, index: usize, value: u8) { + let char_5bit: u8 = *self.char_to_5bit.get(&value).expect("Input character not in alphabet"); + self.bit_array.set(index, char_5bit as u64); + } + + /// Queries the length of the text. + /// + /// # Returns + /// + /// The length of the text + pub fn len(&self) -> usize { + self.bit_array.len() + } + + /// Check if the text is empty (length 0). + /// + /// # Returns + /// + /// True if the text has length 0, false otherwise. + pub fn is_empty(&self) -> bool { + self.bit_array.len() == 0 + } + + /// Clears the `BitArray`, setting all bits to 0. + pub fn clear(&mut self) { + self.bit_array.clear() + } + + /// Get an iterator over the characters of the text. + /// + /// # Returns + /// + /// A `ProteinTextIterator`, which can iterate over the characters of the text.
+ pub fn iter(&self) -> ProteinTextIterator { + ProteinTextIterator { protein_text: self, index: 0 } + } + + /// Get a slice of the text + /// + /// # Returns + /// + /// A `ProteinTextSlice` representing a slice of the text. + pub fn slice(&self, start: usize, end: usize) -> ProteinTextSlice { + ProteinTextSlice::new(self, start, end) + } +} + +/// Structure representing a slice of a `ProteinText`. +pub struct ProteinTextSlice<'a> { + /// The `ProteinText` of which to take a slice. + text: &'a ProteinText, + /// The start of the slice. + start: usize, // included + /// The end of the slice. + end: usize // excluded +} + +impl<'a> ProteinTextSlice<'a> { + /// Creates an instance of `ProteinTextSlice`, given the text and boundaries. + /// + /// # Arguments + /// * `text` - The `ProteinText` representing the text of proteins with 5 bits per amino acid. + /// * `start` - The start of the slice. + /// * `end` - The end of the slice. + /// + /// # Returns + /// + /// An instance of `ProteinTextSlice` + pub fn new(text: &'a ProteinText, start: usize, end: usize) -> ProteinTextSlice<'a> { + Self { text, start, end } + } + + /// Get a character (amino acid) in the slice. + /// + /// # Arguments + /// * `index` - The index in the slice of the character to get. + /// + /// # Returns + /// + /// The character as `u8`. + pub fn get(&self, index: usize) -> u8 { + self.text.get(self.start + index) + } + + /// Get the length of the slice. + /// + /// # Returns + /// + /// The length of the slice. + pub fn len(&self) -> usize { + self.end - self.start + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Checks if the slice and a given array of `u8` are equal. + /// I and L can be equated. + /// + /// # Arguments + /// * `other` - The array of `u8` to compare the slice with. + /// * `equate_il` - True if I and L need to be equated, false otherwise. + /// + /// # Returns + /// + /// True if both match up to the length of the shorter one, false otherwise.
+ #[inline] + pub fn equals_slice(&self, other: &[u8], equate_il: bool) -> bool { + if equate_il { + other.iter().zip(self.iter()).all(|(&search_character, text_character)| { + search_character == text_character + || (search_character == b'I' && text_character == b'L') + || (search_character == b'L' && text_character == b'I') + }) + } else { + other + .iter() + .zip(self.iter()) + .all(|(&search_character, text_character)| search_character == text_character) + } + } + + /// Check if the slice and a given array of `u8` are equal on the I and L positions. + /// + /// # Arguments + /// * `skip` - The number of positions this slice skipped; it is subtracted from each I and L position. + /// * `il_locations` - The positions where I and L occur. + /// * `search_string` - An array of `u8` to compare the slice with. + /// + /// # Returns + /// + /// True if the slice and `search_string` have the same contents on the I and L positions, false otherwise. + pub fn check_il_locations(&self, skip: usize, il_locations: &[usize], search_string: &[u8]) -> bool { + for &il_location in il_locations { + let index = il_location - skip; + if search_string[index] != self.get(index) { + return false; + } + } + true + } + + /// Get an iterator over the slice. + /// + /// # Returns + /// + /// An iterator over the slice. + pub fn iter(&self) -> ProteinTextSliceIterator { + ProteinTextSliceIterator { text_slice: self, index: 0 } + } +} + +/// Structure representing an iterator over a `ProteinText` instance, iterating the characters of the text. +pub struct ProteinTextIterator<'a> { + protein_text: &'a ProteinText, + index: usize +} + +/// Structure representing an iterator over a `ProteinTextSlice` instance, iterating the characters of the slice. +pub struct ProteinTextSliceIterator<'a> { + text_slice: &'a ProteinTextSlice<'a>, + index: usize +} + +impl<'a> Iterator for ProteinTextSliceIterator<'a> { + type Item = u8; + + /// Get the next character in the `ProteinTextSlice`.
+ /// + /// # Returns + /// + /// The next character in the slice. + fn next(&mut self) -> Option { + if self.index >= self.text_slice.len() { + return None; + } + + self.index += 1; + Some(self.text_slice.get(self.index - 1)) + } +} + +impl<'a> Iterator for ProteinTextIterator<'a> { + type Item = u8; + + /// Get the next character in the `ProteinText`. + /// + /// # Returns + /// + /// The next character in the text. + fn next(&mut self) -> Option { + if self.index >= self.protein_text.len() { + return None; + } + + self.index += 1; + Some(self.protein_text.get(self.index - 1)) + } +} + +/// Writes the compressed text to a writer. +/// +/// # Arguments +/// +/// * `text` - The text to be compressed. +/// * `writer` - The writer to which the compressed text will be written. +/// +/// # Errors +/// +/// Returns an error if writing to the writer fails. +pub fn dump_compressed_text(text: Vec, writer: &mut impl Write) -> Result<(), Box> { + let bits_per_value = 5; + + // Write the flags to the writer + // 00000001 indicates that the text is compressed + writer + .write(&[bits_per_value as u8]) + .map_err(|_| "Could not write the required bits to the writer")?; + + // Write the size of the text to the writer + writer + .write(&(text.len() as u64).to_le_bytes()) + .map_err(|_| "Could not write the size of the text to the writer")?; + + // Compress the text and write it to the writer + let text_writer: Vec = text.iter().map(|item| ::from(*item)).collect(); + data_to_writer(text_writer, bits_per_value, 8 * 1024, writer) + .map_err(|_| "Could not write the compressed text to the writer")?; + + Ok(()) +} + +/// Load the compressed text from a reader. +/// +/// # Arguments +/// +/// * `reader` - The reader from which the compressed text will be read. +/// +/// # Errors +/// +/// Returns an error if reading from the reader fails. 
+pub fn load_compressed_text(reader: &mut impl BufRead) -> Result> { + let bits_per_value: usize = 5; + // Read the size of the text from the binary file (8 bytes) + let mut size_buffer = [0_u8; 8]; + reader + .read_exact(&mut size_buffer) + .map_err(|_| "Could not read the size of the text from the binary file")?; + let size = u64::from_le_bytes(size_buffer) as usize; + + // Read the compressed text from the binary file + let mut compressed_text = BitArray::with_capacity(size, bits_per_value); + compressed_text + .read_binary(reader) + .map_err(|_| "Could not read the compressed text from the binary file")?; + + Ok(ProteinText::new(compressed_text)) +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use super::*; + + pub struct FailingWriter { + /// The number of times the write function can be called before it fails. + pub valid_write_count: usize + } + + impl Write for FailingWriter { + fn write(&mut self, _: &[u8]) -> Result { + if self.valid_write_count == 0 { + return Err(std::io::Error::new(std::io::ErrorKind::Other, "Write failed")); + } + + self.valid_write_count -= 1; + Ok(1) + } + + fn flush(&mut self) -> Result<(), std::io::Error> { + Ok(()) + } + } + + pub struct FailingReader { + /// The number of times the read function can be called before it fails. 
+ pub valid_read_count: usize + } + + impl Read for FailingReader { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + if self.valid_read_count == 0 { + return Err(std::io::Error::new(std::io::ErrorKind::Other, "Read failed")); + } + + self.valid_read_count -= 1; + Ok(buf.len()) + } + } + + impl BufRead for FailingReader { + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { + Ok(&[]) + } + + fn consume(&mut self, _: usize) {} + } + + #[test] + fn test_u8_5bit_conversion() { + let char_to_5bit = ProteinText::create_char_to_5bit_hashmap(); + let bit5_to_char = ProteinText::create_bit5_to_char(); + + for c in "ACDEFGHIKLMNPQRSTVWY-$".chars() { + let char_5bit = char_to_5bit.get(&(c as u8)).unwrap(); + assert_eq!(c as u8, bit5_to_char[*char_5bit as usize]); + } + } + + #[test] + fn test_build_from_string() { + let text = ProteinText::from_string("ACACA-CAC$"); + + for (i, c) in "ACACA-CAC$".chars().enumerate() { + assert_eq!(c as u8, text.get(i)); + } + } + + #[test] + fn test_build_from_vec() { + let vec = vec![b'A', b'C', b'A', b'C', b'A', b'-', b'C', b'A', b'C', b'$']; + let text = ProteinText::from_vec(&vec); + + for (i, c) in "ACACA-CAC$".chars().enumerate() { + assert_eq!(c as u8, text.get(i)); + } + } + + #[test] + fn test_build_from_bitarray() { + let input_string = "ACACA-CAC$"; + let char_to_5bit = ProteinText::create_char_to_5bit_hashmap(); + + let mut bit_array = BitArray::with_capacity(input_string.len(), 5); + for (i, c) in input_string.chars().enumerate() { + let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect("Input character not in alphabet"); + bit_array.set(i, char_5bit as u64); + } + + let text = ProteinText::new(bit_array); + + for (i, c) in "ACACA-CAC$".chars().enumerate() { + assert_eq!(c as u8, text.get(i)); + } + } + + #[test] + fn test_build_with_capacity() { + let input_string = "ACACA-CAC$"; + + let mut text = ProteinText::with_capacity(input_string.len()); + for (i, c) in "ACACA-CAC$".chars().enumerate() { + text.set(i, c as 
u8); + } + + for (i, c) in "ACACA-CAC$".chars().enumerate() { + assert_eq!(c as u8, text.get(i)); + } + } + + #[test] + fn test_text_slice() { + let input_string = "ACACA-CAC$"; + let start = 1; + let end = 5; + let text = ProteinText::from_string(&input_string); + let text_slice = text.slice(start, end); + + for (i, c) in input_string[start..end].chars().enumerate() { + assert_eq!(c as u8, text_slice.get(i)); + } + } + + #[test] + fn test_equals_slice() { + let input_string = "ACICA-CAC$"; + let text = ProteinText::from_string(&input_string); + let text_slice = text.slice(1, 5); + let eq_slice_true = [b'C', b'I', b'C', b'A']; + let eq_slice_false = [b'C', b'C', b'C', b'A']; + let eq_slice_il_true = [b'C', b'L', b'C', b'A']; + + assert!(text_slice.equals_slice(&eq_slice_true, false)); + assert!(!text_slice.equals_slice(&eq_slice_false, false)); + assert!(text_slice.equals_slice(&eq_slice_il_true, true)); + } + + #[test] + fn test_check_il_locations() { + let input_string = "ACILA-CAC$"; + let text = ProteinText::from_string(&input_string); + let text_slice = text.slice(1, 5); + let il_locations = [1, 2]; + let il_true = [b'C', b'I', b'L', b'A']; + let il_false = [b'C', b'I', b'C', b'A']; + + assert!(text_slice.check_il_locations(0, &il_locations, &il_true)); + assert!(!text_slice.check_il_locations(0, &il_locations, &il_false)); + } + + #[test] + fn test_dump_compressed_text() { + let text: Vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; + + let mut writer = vec![]; + dump_compressed_text(text, &mut writer).unwrap(); + + assert_eq!(writer, vec![ + // bits per value + 5, // size of the text + 10, 0, 0, 0, 0, 0, 0, 0, // compressed text + 0, 128, 74, 232, 152, 66, 134, 8 + ]); + } + + #[test] + #[should_panic(expected = "Could not write the required bits to the writer")] + fn test_dump_compressed_text_fail_required_bits() { + let mut writer = FailingWriter { valid_write_count: 0 }; + + dump_compressed_text(vec![], &mut writer).unwrap(); + } + + #[test] + 
#[should_panic(expected = "Could not write the size of the text to the writer")] + fn test_dump_compressed_text_fail_size() { + let mut writer = FailingWriter { valid_write_count: 1 }; + + dump_compressed_text(vec![], &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the compressed text to the writer")] + fn test_dump_compressed_text_fail_compressed_text() { + let mut writer = FailingWriter { valid_write_count: 3 }; + + dump_compressed_text(vec![1], &mut writer).unwrap(); + } + + #[test] + fn test_load_compressed_text() { + let data = vec![ + // size of the text + 10, 0, 0, 0, 0, 0, 0, 0, // compressed text + 0, 128, 74, 232, 152, 66, 134, 8, + ]; + + let mut reader = std::io::BufReader::new(&data[..]); + let compressed_text = load_compressed_text(&mut reader).unwrap(); + + for (i, c) in "CDEFGHIKLM".chars().enumerate() { + assert_eq!(compressed_text.get(i), c as u8); + } + } + + #[test] + #[should_panic(expected = "Could not read the size of the text from the binary file")] + fn test_load_compressed_text_fail_size() { + let mut reader = FailingReader { valid_read_count: 0 }; + + load_compressed_text(&mut reader).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not read the compressed text from the binary file")] + fn test_load_compressed_text_fail_compressed_text() { + let mut reader = FailingReader { valid_read_count: 2 }; + + load_compressed_text(&mut reader).unwrap(); + } + + #[test] + fn test_failing_writer() { + let mut writer = FailingWriter { valid_write_count: 0 }; + assert!(writer.flush().is_ok()); + assert!(writer.write(&[0]).is_err()); + } + + #[test] + fn test_failing_reader() { + let mut reader = FailingReader { valid_read_count: 0 }; + let right_buffer: [u8; 0] = []; + assert_eq!(reader.fill_buf().unwrap(), &right_buffer); + assert_eq!(reader.consume(0), ()); + let mut buffer = [0_u8; 1]; + assert!(reader.read(&mut buffer).is_err()); + } +}