From 859a66bb67cc5fe05873e6ef9afc823986b30390 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Tue, 28 May 2024 18:18:29 +0200 Subject: [PATCH] more debug info + some movement of SA code --- .gitignore | 3 + Cargo.lock | 1 + bitarray/src/lib.rs | 15 +++++ fa-compression/src/algorithm1/decode.rs | 4 +- fa-compression/src/algorithm1/encode.rs | 2 +- fa-compression/src/algorithm1/mod.rs | 6 +- sa-builder/src/main.rs | 34 ++++++++--- sa-compression/Cargo.toml | 1 + sa-compression/src/lib.rs | 15 ++--- sa-index/src/binary.rs | 14 +++-- sa-index/src/lib.rs | 78 +++++++++++++++++++++---- sa-index/src/peptide_search.rs | 2 +- sa-index/src/sa_searcher.rs | 42 +++++-------- sa-mappings/src/proteins.rs | 38 ++++++------ sa-server/src/main.rs | 37 +++++++----- 15 files changed, 194 insertions(+), 98 deletions(-) diff --git a/.gitignore b/.gitignore index 2f7896d..f534053 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ target/ +data/ + +.DS_Store diff --git a/Cargo.lock b/Cargo.lock index 8b2e119..900c218 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1422,6 +1422,7 @@ name = "sa-compression" version = "0.1.0" dependencies = [ "bitarray", + "sa-index", ] [[package]] diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index ce2d9ef..0a5f647 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -120,6 +120,15 @@ impl BitArray { self.data[end_block] |= value << (64 - end_block_offset); } + /// Returns the number of bits in a single value. + /// + /// # Returns + /// + /// The number of bits in a single value. + pub fn bits_per_value(&self) -> usize { + self.bits_per_value + } + /// Returns the length of the `BitArray`. /// /// # Returns @@ -266,6 +275,12 @@ mod tests { assert_eq!(bitarray.data, vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000]); } + #[test] + fn test_bitarray_bits_per_value() { + let bitarray = BitArray::with_capacity(4, 40); + assert_eq!(bitarray.bits_per_value(), 40); + } + #[test] fn test_bitarray_len() { let bitarray = BitArray::with_capacity(4, 40); diff --git a/fa-compression/src/algorithm1/decode.rs b/fa-compression/src/algorithm1/decode.rs index c24fc4b..5295a58 100644 --- a/fa-compression/src/algorithm1/decode.rs +++ b/fa-compression/src/algorithm1/decode.rs @@ -28,9 +28,9 @@ static PREFIXES: [&str; 3] = ["EC:", "GO:", "IPR:IPR"]; /// ``` /// use fa_compression::algorithm1::decode; /// -/// let input = &[ 44, 44, 44, 189, 17, 26, 56, 173, 18, 116, 117, 225, 67, 116, 110, 17, 153, 39 ]; +/// let input = &[ 44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117 ]; /// let result = decode(input); -/// assert_eq!(result, "EC:1.1.1.-;GO:0009279;IPR:IPR016364;IPR:IPR032635;IPR:IPR008816"); +/// assert_eq!(result, "EC:1.1.1.-;GO:0009279;IPR:IPR016364"); /// ``` pub fn decode(input: &[u8]) -> String { if input.is_empty() { diff --git a/fa-compression/src/algorithm1/encode.rs b/fa-compression/src/algorithm1/encode.rs index e33351d..0877c9a 100644 --- a/fa-compression/src/algorithm1/encode.rs +++ b/fa-compression/src/algorithm1/encode.rs @@ -28,7 +28,7 @@ use super::{ /// let input = "IPR:IPR016364;EC:1.1.1.-;GO:0009279"; /// let encoded = encode(input); /// -/// assert_eq!(encoded, vec![ 44, 44, 44, 189, 17, 26, 56, 173, 18, 116, 117 ]); +/// assert_eq!(encoded, vec![ 44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117 ]); /// ``` pub fn encode(input: &str) -> Vec { if input.is_empty() { diff --git a/fa-compression/src/algorithm1/mod.rs b/fa-compression/src/algorithm1/mod.rs index 5c62e7c..a495c9e 100644 --- a/fa-compression/src/algorithm1/mod.rs +++ b/fa-compression/src/algorithm1/mod.rs @@ -170,8 +170,10 @@ impl BitOr for CharacterSet { mod tests { use super::*; - static CHARACTERS: [u8; 16] = - [b'$', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'-', b'.', b'n', b',', b';']; + static CHARACTERS: [u8; 16] = [ + b'$', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'-', b'.', b'n', b',', + b';' + ]; static CHARACTER_SETS: [CharacterSet; 16] = [ CharacterSet::Empty, diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index a20702a..812bd7d 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -31,38 +31,54 @@ fn main() { compress_sa } = Arguments::parse(); + eprintln!(); + eprintln!("📋 Started loading the taxon file..."); let taxon_id_calculator = TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar) .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); + eprintln!("✅ Successfully loaded the taxon file!"); + eprintln!("\tAggregation method: LCA*"); - // read input + eprintln!(); + eprintln!("📋 Started loading the proteins..."); let mut data = Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator) .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); + eprintln!("✅ Successfully loaded the proteins!"); - // calculate sparse suffix array + eprintln!(); + eprintln!("📋 Started building the suffix array..."); let sa = build_ssa(&mut data, &construction_algorithm, sparseness_factor) .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); - - eprintln!("Suffix array constructed successfully."); - eprintln!("sa length: {}", sa.len()); + eprintln!("✅ Successfully built the suffix array!"); + eprintln!("\tAmount of items: {}", sa.len()); + eprintln!("\tSample rate: {}", sparseness_factor); // open the output file let mut file = open_file(&output).unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); + eprintln!(); + eprintln!("📋 Started dumping the suffix array..."); + if compress_sa { let bits_per_value = (data.len() as f64).log2().ceil() as usize; - eprintln!("Compressing suffix array with {} bits per value.", bits_per_value); - if let Err(err) = dump_compressed_suffix_array(sa, sparseness_factor, bits_per_value, &mut file) { eprint_and_exit(err.to_string().as_str()); }; - } else if let Err(err) = dump_suffix_array(&sa, sparseness_factor, &mut file) { - eprint_and_exit(err.to_string().as_str()); + + eprintln!("✅ Successfully dumped the suffix array!"); + eprintln!("\tAmount of bits per item: {}", bits_per_value); + } else { + if let Err(err) = dump_suffix_array(&sa, sparseness_factor, &mut file) { + eprint_and_exit(err.to_string().as_str()); + } + + eprintln!("✅ Successfully dumped the suffix array!"); + eprintln!("\tAmount of bits per item: 64"); } } diff --git a/sa-compression/Cargo.toml b/sa-compression/Cargo.toml index a53939b..70a6cbf 100644 --- a/sa-compression/Cargo.toml +++ b/sa-compression/Cargo.toml @@ -7,3 +7,4 @@ edition = "2021" [dependencies] bitarray = { path = "../bitarray" } +sa-index = { path = "../sa-index" } diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs index b7bac78..85a41df 100644 --- a/sa-compression/src/lib.rs +++ b/sa-compression/src/lib.rs @@ -11,6 +11,7 @@ use bitarray::{ Binary, BitArray }; +use sa_index::SuffixArray; /// Writes the compressed suffix array to a writer. /// @@ -66,7 +67,7 @@ pub fn dump_compressed_suffix_array( pub fn load_compressed_suffix_array( reader: &mut impl BufRead, bits_per_value: usize -) -> Result<(u8, BitArray), Box> { +) -> Result> { // Read the sample rate from the binary file (1 byte) let mut sample_rate_buffer = [0_u8; 1]; reader @@ -87,7 +88,7 @@ pub fn load_compressed_suffix_array( .read_binary(reader) .map_err(|_| "Could not read the compressed suffix array from the binary file")?; - Ok((sample_rate, compressed_suffix_array)) + Ok(SuffixArray::Compressed(compressed_suffix_array, sample_rate)) } #[cfg(test)] @@ -209,12 +210,11 @@ mod tests { ]; let mut reader = std::io::BufReader::new(&data[..]); - let (sample_rate, compressed_suffix_array) = - load_compressed_suffix_array(&mut reader, 8).unwrap(); + let compressed_suffix_array = load_compressed_suffix_array(&mut reader, 8).unwrap(); - assert_eq!(sample_rate, 1); + assert_eq!(compressed_suffix_array.sample_rate(), 1); for i in 0 .. 10 { - assert_eq!(compressed_suffix_array.get(i), i as u64 + 1); + assert_eq!(compressed_suffix_array.get(i), i as i64 + 1); } } @@ -262,7 +262,8 @@ mod tests { let mut reader = FailingReader { valid_read_count: 0 }; - assert_eq!(reader.fill_buf().unwrap(), &[]); + let right_buffer: [u8; 0] = []; + assert_eq!(reader.fill_buf().unwrap(), &right_buffer); assert_eq!(reader.consume(0), ()); let mut buffer = [0_u8; 1]; assert!(reader.read(&mut buffer).is_err()); diff --git a/sa-index/src/binary.rs b/sa-index/src/binary.rs index ae16a8f..5688d4a 100644 --- a/sa-index/src/binary.rs +++ b/sa-index/src/binary.rs @@ -7,6 +7,8 @@ use std::{ } }; +use crate::SuffixArray; + /// The `Binary` trait provides methods for reading and writing a struct as binary. pub trait Binary { /// Writes the struct as binary to the given writer. @@ -132,7 +134,7 @@ pub fn dump_suffix_array( /// # Errors /// /// Returns any error from opening the file or reading the file -pub fn load_suffix_array(reader: &mut impl BufRead) -> Result<(u8, Vec), Box> { +pub fn load_suffix_array(reader: &mut impl BufRead) -> Result> { // Read the sample rate from the binary file (1 byte) let mut sample_rate_buffer = [0_u8; 1]; reader @@ -151,7 +153,7 @@ pub fn load_suffix_array(reader: &mut impl BufRead) -> Result<(u8, Vec), Bo sa.read_binary(reader) .map_err(|_| "Could not read the suffix array from the binary file")?; - Ok((sample_rate, sa)) + Ok(SuffixArray::Original(sa, sample_rate)) } /// Fills the buffer with data read from the input. @@ -374,10 +376,12 @@ mod tests { ]; let mut reader = buffer.as_slice(); - let (sample_rate, sa) = load_suffix_array(&mut reader).unwrap(); + let sa = load_suffix_array(&mut reader).unwrap(); - assert_eq!(sample_rate, 1); - assert_eq!(sa, vec![1, 2, 3, 4, 5]); + assert_eq!(sa.sample_rate(), 1); + for i in 0 .. 5 { + assert_eq!(sa.get(i), i as i64 + 1); + } } #[test] diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs index ca13a82..f276906 100644 --- a/sa-index/src/lib.rs +++ b/sa-index/src/lib.rs @@ -8,9 +8,9 @@ pub mod suffix_to_protein_index; /// Represents a suffix array. pub enum SuffixArray { /// The original suffix array. - Original(Vec), + Original(Vec, u8), /// The compressed suffix array. - Compressed(BitArray) + Compressed(BitArray, u8) } impl SuffixArray { @@ -21,12 +21,36 @@ impl SuffixArray { /// The length of the suffix array. pub fn len(&self) -> usize { match self { - SuffixArray::Original(sa) => sa.len(), - SuffixArray::Compressed(sa) => sa.len() + SuffixArray::Original(sa, _) => sa.len(), + SuffixArray::Compressed(sa, _) => sa.len() } } - /// Returns the suffix array at the given index. + /// Returns the number of bits per value in the suffix array. + /// + /// # Returns + /// + /// The number of bits per value in the suffix array. + pub fn bits_per_value(&self) -> usize { + match self { + SuffixArray::Original(_, _) => 64, + SuffixArray::Compressed(sa, _) => sa.bits_per_value() + } + } + + /// Returns the sample rate used for the suffix array. + /// + /// # Returns + /// + /// The sample rate used for the suffix array. + pub fn sample_rate(&self) -> u8 { + match self { + SuffixArray::Original(_, sample_rate) => *sample_rate, + SuffixArray::Compressed(_, sample_rate) => *sample_rate + } + } + + /// Returns the suffix array value at the given index. /// /// # Arguments /// @@ -37,8 +61,8 @@ impl SuffixArray { /// The suffix array at the given index. pub fn get(&self, index: usize) -> i64 { match self { - SuffixArray::Original(sa) => sa[index], - SuffixArray::Compressed(sa) => sa.get(index) as i64 + SuffixArray::Original(sa, _) => sa[index], + SuffixArray::Compressed(sa, _) => sa.get(index) as i64 } } @@ -46,7 +70,7 @@ impl SuffixArray { /// /// # Returns /// - /// True if the suffix array is empty, false otherwise. + /// Returns `true` if the suffix array is empty, `false` otherwise. pub fn is_empty(&self) -> bool { self.len() == 0 } @@ -79,7 +103,7 @@ mod tests { #[test] fn test_suffix_array_original() { - let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5]); + let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1); assert_eq!(sa.len(), 5); assert_eq!(sa.get(0), 1); assert_eq!(sa.get(1), 2); @@ -97,7 +121,7 @@ mod tests { bitarray.set(3, 4); bitarray.set(4, 5); - let sa = SuffixArray::Compressed(bitarray); + let sa = SuffixArray::Compressed(bitarray, 1); assert_eq!(sa.len(), 5); assert_eq!(sa.get(0), 1); assert_eq!(sa.get(1), 2); @@ -106,13 +130,43 @@ mod tests { assert_eq!(sa.get(4), 5); } + #[test] + fn test_suffix_array_len() { + let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1); + assert_eq!(sa.len(), 5); + + let bitarray = BitArray::with_capacity(5, 40); + let sa = SuffixArray::Compressed(bitarray, 1); + assert_eq!(sa.len(), 5); + } + + #[test] + fn test_suffix_array_bits_per_value() { + let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1); + assert_eq!(sa.bits_per_value(), 64); + + let bitarray = BitArray::with_capacity(5, 40); + let sa = SuffixArray::Compressed(bitarray, 1); + assert_eq!(sa.bits_per_value(), 40); + } + + #[test] + fn test_suffix_array_sample_rate() { + let sa = SuffixArray::Original(vec![1, 2, 3, 4, 5], 1); + assert_eq!(sa.sample_rate(), 1); + + let bitarray = BitArray::with_capacity(5, 40); + let sa = SuffixArray::Compressed(bitarray, 1); + assert_eq!(sa.sample_rate(), 1); + } + #[test] fn test_suffix_array_is_empty() { - let sa = SuffixArray::Original(vec![]); + let sa = SuffixArray::Original(vec![], 1); assert_eq!(sa.is_empty(), true); let bitarray = BitArray::with_capacity(0, 0); - let sa = SuffixArray::Compressed(bitarray); + let sa = SuffixArray::Compressed(bitarray, 1); assert_eq!(sa.is_empty(), true); } diff --git a/sa-index/src/peptide_search.rs b/sa-index/src/peptide_search.rs index 623eba6..7d83914 100644 --- a/sa-index/src/peptide_search.rs +++ b/sa-index/src/peptide_search.rs @@ -70,7 +70,7 @@ pub fn search_proteins_for_peptide<'a>( let peptide = peptide.strip_suffix('\n').unwrap_or(peptide).to_uppercase(); // words that are shorter than the sample rate are not searchable - if peptide.len() < searcher.sparseness_factor as usize { + if peptide.len() < searcher.sa.sample_rate() as usize { return None; } diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index 3c82ea4..29bbc9a 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -102,7 +102,6 @@ impl PartialEq for SearchAllSuffixesResult { /// the functional analysis provided by Unipept pub struct Searcher { pub sa: SuffixArray, - pub sparseness_factor: u8, pub suffix_index_to_protein: Box, pub proteins: Proteins, pub taxon_id_calculator: TaxonAggregator, @@ -128,7 +127,6 @@ impl Searcher { /// Returns a new Searcher object pub fn new( sa: SuffixArray, - sparseness_factor: u8, suffix_index_to_protein: Box, proteins: Proteins, taxon_id_calculator: TaxonAggregator, @@ -136,7 +134,6 @@ impl Searcher { ) -> Self { Self { sa, - sparseness_factor, suffix_index_to_protein, proteins, taxon_id_calculator, @@ -324,7 +321,7 @@ impl Searcher { } let mut skip: usize = 0; - while skip < self.sparseness_factor as usize { + while skip < self.sa.sample_rate() as usize { let mut il_locations_start = 0; while il_locations_start < il_locations.len() && il_locations[il_locations_start] < skip { @@ -654,16 +651,16 @@ mod tests { #[test] fn test_search_simple() { let proteins = get_example_proteins(); - let sa = SuffixArray::Original(vec![ - 19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18, - ]); + let sa = SuffixArray::Original( + vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], + 1 + ); let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); let searcher = Searcher::new( sa, - 1, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, TaxonAggregator::try_from_taxonomy_file( @@ -690,14 +687,13 @@ mod tests { #[test] fn test_search_sparse() { let proteins = get_example_proteins(); - let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18]); + let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3); let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); let searcher = Searcher::new( sa, - 3, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, TaxonAggregator::try_from_taxonomy_file( @@ -721,16 +717,16 @@ mod tests { #[test] fn test_il_equality() { let proteins = get_example_proteins(); - let sa = SuffixArray::Original(vec![ - 19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18, - ]); + let sa = SuffixArray::Original( + vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], + 1 + ); let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); let searcher = Searcher::new( sa, - 1, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, TaxonAggregator::try_from_taxonomy_file( @@ -752,14 +748,13 @@ mod tests { #[test] fn test_il_equality_sparse() { let proteins = get_example_proteins(); - let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18]); + let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3); let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); let searcher = Searcher::new( sa, - 3, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, TaxonAggregator::try_from_taxonomy_file( @@ -798,10 +793,9 @@ mod tests { let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = SuffixArray::Original(vec![0, 2, 4]); + let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2); let searcher = Searcher::new( sparse_sa, - 2, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, TaxonAggregator::try_from_taxonomy_file( @@ -833,10 +827,9 @@ mod tests { let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2]); + let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1); let searcher = Searcher::new( sparse_sa, - 1, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, TaxonAggregator::try_from_taxonomy_file( @@ -867,10 +860,9 @@ mod tests { let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0]); + let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); let searcher = Searcher::new( sparse_sa, - 1, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, TaxonAggregator::try_from_taxonomy_file( @@ -901,10 +893,9 @@ mod tests { let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0]); + let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2); let searcher = Searcher::new( sparse_sa, - 2, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, TaxonAggregator::try_from_taxonomy_file( @@ -937,10 +928,9 @@ mod tests { let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0]); + let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); let searcher = Searcher::new( sparse_sa, - 1, Box::new(SparseSuffixToProtein::new(&proteins.input_string)), proteins, TaxonAggregator::try_from_taxonomy_file( diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs index 87178b0..e1d2f49 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -10,7 +10,10 @@ use std::{ }; use bytelines::ByteLines; -use fa_compression::algorithm1::{decode, encode}; +use fa_compression::algorithm1::{ + decode, + encode +}; use umgap::taxon::TaxonId; use crate::taxonomy::TaxonAggregator; @@ -185,26 +188,21 @@ mod tests { let database_file = tmp_dir.path().join("database.tsv"); let mut file = File::create(&database_file).unwrap(); - file.write("P12345\t1\tMLPGLALLLLAAWTARALEV\t".as_bytes()) - .unwrap(); - file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]) - .unwrap(); - file.write("\n".as_bytes()).unwrap(); - file.write("P54321\t2\tPTDGNAGLLAEPQIAMFCGRLNMHMNVQNG\t".as_bytes()) - .unwrap(); - file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]) - .unwrap(); - file.write("\n".as_bytes()).unwrap(); - file.write("P67890\t6\tKWDSDPSGTKTCIDT\t".as_bytes()) - .unwrap(); - file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]) - .unwrap(); - file.write("\n".as_bytes()).unwrap(); - file.write("P13579\t17\tKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH\t".as_bytes()) - .unwrap(); - file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]) + file.write( + "P12345\t1\tMLPGLALLLLAAWTARALEV\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes() + ) + .unwrap(); + file.write( + "P54321\t2\tPTDGNAGLLAEPQIAMFCGRLNMHMNVQNG\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n" + .as_bytes() + ) + .unwrap(); + file.write( + "P67890\t6\tKWDSDPSGTKTCIDT\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes() + ) + .unwrap(); + file.write("P13579\t17\tKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes()) .unwrap(); - file.write("\n".as_bytes()).unwrap(); database_file } diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs index 7fb3322..ef774b2 100644 --- a/sa-server/src/main.rs +++ b/sa-server/src/main.rs @@ -175,23 +175,34 @@ async fn start_server(args: Arguments) -> Result<(), Box> { taxonomy } = args; - eprintln!("Loading suffix array..."); - let (sample_rate, sa) = load_suffix_array_file(&index_file)?; - - eprintln!("Loading taxon file..."); + eprintln!(); + eprintln!("📋 Started loading the suffix array..."); + let sa = load_suffix_array_file(&index_file)?; + eprintln!("✅ Successfully loaded the suffix array!"); + eprintln!("\tAmount of items: {}", sa.len()); + eprintln!("\tAmount of bits per item: {}", sa.bits_per_value()); + eprintln!("\tSample rate: {}", sa.sample_rate()); + + eprintln!(); + eprintln!("📋 Started loading the taxon file..."); let taxon_id_calculator = TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar)?; + eprintln!("✅ Successfully loaded the taxon file!"); + eprintln!("\tAggregation method: LCA*"); + eprintln!(); + eprintln!("📋 Started creating the function aggregator..."); let function_aggregator = FunctionAggregator {}; + eprintln!("✅ Successfully created the function aggregator!"); - eprintln!("Loading proteins..."); + eprintln!(); + eprintln!("📋 Started loading the proteins..."); let proteins = Proteins::try_from_database_file(&database_file, &taxon_id_calculator)?; let suffix_index_to_protein = Box::new(SparseSuffixToProtein::new(&proteins.input_string)); + eprintln!("✅ Successfully loaded the proteins!"); - eprintln!("Creating searcher..."); let searcher = Arc::new(Searcher::new( sa, - sample_rate, suffix_index_to_protein, proteins, taxon_id_calculator, @@ -212,13 +223,15 @@ async fn start_server(args: Arguments) -> Result<(), Box> { .with_state(searcher); let listener = tokio::net::TcpListener::bind("0.0.0.0:3000").await?; - println!("server is ready..."); + + eprintln!(); + eprintln!("🚀 Server is ready..."); axum::serve(listener, app).await?; Ok(()) } -fn load_suffix_array_file(file: &str) -> Result<(u8, SuffixArray), Box> { +fn load_suffix_array_file(file: &str) -> Result> { // Open the suffix array file let mut sa_file = File::open(file)?; @@ -233,10 +246,8 @@ fn load_suffix_array_file(file: &str) -> Result<(u8, SuffixArray), Box