diff --git a/Cargo.lock b/Cargo.lock index 3f82911..8b2e119 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1430,10 +1430,12 @@ version = "0.1.0" dependencies = [ "bitarray", "clap 4.5.4", + "fa-compression", "rayon", "sa-mappings", "serde", "serde_json", + "tempdir", "umgap", ] diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml index 986dc3c..557549c 100644 --- a/sa-index/Cargo.toml +++ b/sa-index/Cargo.toml @@ -5,6 +5,10 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[dev-dependencies] +tempdir = "0.3.7" +fa-compression = { path = "../fa-compression" } + [dependencies] clap = { version = "4.4.8", features = ["derive"] } umgap = "1.1.0" diff --git a/sa-index/out.txt b/sa-index/out.txt new file mode 100644 index 0000000..0bd0be1 --- /dev/null +++ b/sa-index/out.txt @@ -0,0 +1,1515 @@ +#![feature(prelude_import)] +#[prelude_import] +use std::prelude::rust_2021::*; +#[macro_use] +extern crate std; +use bitarray::BitArray; +pub mod binary { + use std::{error::Error, io::{BufRead, Read, Write}}; + /// The `Binary` trait provides methods for reading and writing a struct as binary. + pub trait Binary { + /// Writes the struct as binary to the given writer. + /// + /// # Arguments + /// + /// * `writer` - The writer to write the binary data to. + /// + /// # Returns + /// + /// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs. + fn write_binary(&self, writer: &mut W) -> std::io::Result<()>; + /// Reads binary data into a struct from the given reader. + /// + /// # Arguments + /// + /// * `reader` - The reader to read the binary data from. + /// + /// # Returns + /// + /// Returns `Ok(())` if the read operation is successful, or an `Err` if an error occurs. + fn read_binary(&mut self, reader: R) -> std::io::Result<()>; + } + /// Implements the `Binary` trait for `Vec`. + impl Binary for Vec { + /// Writes the elements of the vector to a binary file. + /// + /// # Arguments + /// + /// * `writer` - The writer to which the binary data will be written. + /// + /// # Returns + /// + /// Returns `Ok(())` if the write operation is successful, or an `std::io::Error` otherwise. + fn write_binary(&self, writer: &mut W) -> std::io::Result<()> { + for value in self { + writer.write_all(&value.to_le_bytes())?; + } + Ok(()) + } + /// Reads binary data from a reader and populates the vector with the read values. + /// + /// # Arguments + /// + /// * `reader` - The reader from which the binary data will be read. + /// + /// # Returns + /// + /// Returns `Ok(())` if the read operation is successful, or an `std::io::Error` otherwise. + fn read_binary(&mut self, mut reader: R) -> std::io::Result<()> { + self.clear(); + let mut buffer = ::alloc::vec::from_elem(0, 8 * 1024); + loop { + let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer)?; + for buffer_slice in buffer[..bytes_read].chunks_exact(8) { + self.push(i64::from_le_bytes(buffer_slice.try_into().unwrap())); + } + if finished { + break; + } + } + Ok(()) + } + } + /// Writes the suffix array to a binary file. + /// + /// # Arguments + /// + /// * `sa` - The suffix array to dump. + /// * `sparseness_factor` - The sparseness factor to write to the file. + /// * `writer` - The writer to write the binary data to. + /// + /// # Returns + /// + /// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs. + pub fn dump_suffix_array( + sa: &Vec, + sparseness_factor: u8, + writer: &mut impl Write, + ) -> Result<(), Box> { + writer + .write(&[64_u8]) + .map_err(|_| "Could not write the required bits to the writer")?; + writer + .write(&[sparseness_factor]) + .map_err(|_| "Could not write the sparseness factor to the writer")?; + let sa_len = sa.len(); + writer + .write(&(sa_len).to_le_bytes()) + .map_err(|_| "Could not write the size of the suffix array to the writer")?; + sa.write_binary(writer) + .map_err(|_| "Could not write the suffix array to the writer")?; + Ok(()) + } + /// Loads the suffix array from the file with the given `filename` + /// + /// # Arguments + /// * `filename` - The filename of the file where the suffix array is stored + /// + /// # Returns + /// + /// Returns the sample rate of the suffix array, together with the suffix array + /// + /// # Errors + /// + /// Returns any error from opening the file or reading the file + pub fn load_suffix_array( + reader: &mut impl BufRead, + ) -> Result<(u8, Vec), Box> { + let mut sample_rate_buffer = [0_u8; 1]; + reader + .read_exact(&mut sample_rate_buffer) + .map_err(|_| "Could not read the sample rate from the binary file")?; + let sample_rate = sample_rate_buffer[0]; + let mut size_buffer = [0_u8; 8]; + reader + .read_exact(&mut size_buffer) + .map_err(|_| { + "Could not read the size of the suffix array from the binary file" + })?; + let size = u64::from_le_bytes(size_buffer) as usize; + let mut sa = Vec::with_capacity(size); + sa.read_binary(reader) + .map_err(|_| "Could not read the suffix array from the binary file")?; + Ok((sample_rate, sa)) + } + /// Fills the buffer with data read from the input. + /// + /// # Arguments + /// + /// * `input` - The input source to read data from. + /// * `buffer` - The buffer to fill with data. + /// + /// # Returns + /// + /// Returns a tuple `(finished, bytes_read)` where `finished` indicates whether the end of the input + /// is reached, and `bytes_read` is the number of bytes read into the buffer. + fn fill_buffer( + input: &mut T, + buffer: &mut Vec, + ) -> std::io::Result<(bool, usize)> { + let buffer_size = buffer.len(); + let mut writable_buffer_space = buffer.as_mut(); + loop { + match input.read(writable_buffer_space) { + Ok(0) => { + return Ok(( + !writable_buffer_space.is_empty(), + buffer_size - writable_buffer_space.len(), + )); + } + Ok(bytes_read) => { + writable_buffer_space = writable_buffer_space[bytes_read..].as_mut(); + } + Err(e) => { + return Err(e); + } + } + } + } +} +pub mod peptide_search { + use rayon::prelude::*; + use sa_mappings::{functionality::FunctionalAggregation, proteins::Protein}; + use serde::Serialize; + use crate::sa_searcher::{SearchAllSuffixesResult, Searcher}; + /// Struct representing a collection of `SearchResultWithAnalysis` or `SearchOnlyResult` results + pub struct OutputData { + result: Vec, + } + #[automatically_derived] + impl ::core::fmt::Debug for OutputData { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_struct_field1_finish( + f, + "OutputData", + "result", + &&self.result, + ) + } + } + #[doc(hidden)] + #[allow(non_upper_case_globals, unused_attributes, unused_qualifications)] + const _: () = { + #[allow(unused_extern_crates, clippy::useless_attribute)] + extern crate serde as _serde; + #[automatically_derived] + impl _serde::Serialize for OutputData + where + T: _serde::Serialize, + { + fn serialize<__S>( + &self, + __serializer: __S, + ) -> _serde::__private::Result<__S::Ok, __S::Error> + where + __S: _serde::Serializer, + { + let mut __serde_state = _serde::Serializer::serialize_struct( + __serializer, + "OutputData", + false as usize + 1, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "result", + &self.result, + )?; + _serde::ser::SerializeStruct::end(__serde_state) + } + } + }; + /// Struct representing the search result of the `sequence` in the index, including the analyses + pub struct SearchResultWithAnalysis { + sequence: String, + lca: Option, + taxa: Vec, + uniprot_accession_numbers: Vec, + fa: Option, + cutoff_used: bool, + } + #[automatically_derived] + impl ::core::fmt::Debug for SearchResultWithAnalysis { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + let names: &'static _ = &[ + "sequence", + "lca", + "taxa", + "uniprot_accession_numbers", + "fa", + "cutoff_used", + ]; + let values: &[&dyn ::core::fmt::Debug] = &[ + &self.sequence, + &self.lca, + &self.taxa, + &self.uniprot_accession_numbers, + &self.fa, + &&self.cutoff_used, + ]; + ::core::fmt::Formatter::debug_struct_fields_finish( + f, + "SearchResultWithAnalysis", + names, + values, + ) + } + } + #[doc(hidden)] + #[allow(non_upper_case_globals, unused_attributes, unused_qualifications)] + const _: () = { + #[allow(unused_extern_crates, clippy::useless_attribute)] + extern crate serde as _serde; + #[automatically_derived] + impl _serde::Serialize for SearchResultWithAnalysis { + fn serialize<__S>( + &self, + __serializer: __S, + ) -> _serde::__private::Result<__S::Ok, __S::Error> + where + __S: _serde::Serializer, + { + let mut __serde_state = _serde::Serializer::serialize_struct( + __serializer, + "SearchResultWithAnalysis", + false as usize + 1 + 1 + 1 + 1 + 1 + 1, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "sequence", + &self.sequence, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "lca", + &self.lca, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "taxa", + &self.taxa, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "uniprot_accession_numbers", + &self.uniprot_accession_numbers, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "fa", + &self.fa, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "cutoff_used", + &self.cutoff_used, + )?; + _serde::ser::SerializeStruct::end(__serde_state) + } + } + }; + /// Struct representing the search result of the `sequence` in the index (without the analyses) + pub struct SearchOnlyResult { + sequence: String, + proteins: Vec, + cutoff_used: bool, + } + #[automatically_derived] + impl ::core::fmt::Debug for SearchOnlyResult { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_struct_field3_finish( + f, + "SearchOnlyResult", + "sequence", + &self.sequence, + "proteins", + &self.proteins, + "cutoff_used", + &&self.cutoff_used, + ) + } + } + #[doc(hidden)] + #[allow(non_upper_case_globals, unused_attributes, unused_qualifications)] + const _: () = { + #[allow(unused_extern_crates, clippy::useless_attribute)] + extern crate serde as _serde; + #[automatically_derived] + impl _serde::Serialize for SearchOnlyResult { + fn serialize<__S>( + &self, + __serializer: __S, + ) -> _serde::__private::Result<__S::Ok, __S::Error> + where + __S: _serde::Serializer, + { + let mut __serde_state = _serde::Serializer::serialize_struct( + __serializer, + "SearchOnlyResult", + false as usize + 1 + 1 + 1, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "sequence", + &self.sequence, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "proteins", + &self.proteins, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "cutoff_used", + &self.cutoff_used, + )?; + _serde::ser::SerializeStruct::end(__serde_state) + } + } + }; + /// Struct that represents all information known about a certain protein in our database + pub struct ProteinInfo { + taxon: usize, + uniprot_accession: String, + functional_annotations: Vec, + } + #[automatically_derived] + impl ::core::fmt::Debug for ProteinInfo { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_struct_field3_finish( + f, + "ProteinInfo", + "taxon", + &self.taxon, + "uniprot_accession", + &self.uniprot_accession, + "functional_annotations", + &&self.functional_annotations, + ) + } + } + #[doc(hidden)] + #[allow(non_upper_case_globals, unused_attributes, unused_qualifications)] + const _: () = { + #[allow(unused_extern_crates, clippy::useless_attribute)] + extern crate serde as _serde; + #[automatically_derived] + impl _serde::Serialize for ProteinInfo { + fn serialize<__S>( + &self, + __serializer: __S, + ) -> _serde::__private::Result<__S::Ok, __S::Error> + where + __S: _serde::Serializer, + { + let mut __serde_state = _serde::Serializer::serialize_struct( + __serializer, + "ProteinInfo", + false as usize + 1 + 1 + 1, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "taxon", + &self.taxon, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "uniprot_accession", + &self.uniprot_accession, + )?; + _serde::ser::SerializeStruct::serialize_field( + &mut __serde_state, + "functional_annotations", + &self.functional_annotations, + )?; + _serde::ser::SerializeStruct::end(__serde_state) + } + } + }; + /// Searches the `peptide` in the index multithreaded and retrieves the matching proteins + /// + /// # Arguments + /// * `searcher` - The Searcher which contains the protein database + /// * `peptide` - The peptide that is being searched in the index + /// * `cutoff` - The maximum amount of matches we want to process from the index + /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search + /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the + /// taxonomy + /// + /// # Returns + /// + /// Returns Some if matches are found. + /// The first argument is true if the cutoff is used, otherwise false + /// The second argument is a list of all matching proteins for the peptide + /// Returns None if the peptides does not have any matches, or if the peptide is shorter than the + /// sparseness factor k used in the index + pub fn search_proteins_for_peptide<'a>( + searcher: &'a Searcher, + peptide: &str, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, + ) -> Option<(bool, Vec<&'a Protein>)> { + let peptide = peptide.strip_suffix('\n').unwrap_or(peptide).to_uppercase(); + if peptide.len() < searcher.sparseness_factor as usize { + return None; + } + let suffix_search = searcher + .search_matching_suffixes(peptide.as_bytes(), cutoff, equalize_i_and_l); + let mut cutoff_used = false; + let suffixes = match suffix_search { + SearchAllSuffixesResult::MaxMatches(matched_suffixes) => { + cutoff_used = true; + matched_suffixes + } + SearchAllSuffixesResult::SearchResult(matched_suffixes) => matched_suffixes, + SearchAllSuffixesResult::NoMatches => { + return None; + } + }; + let mut proteins = searcher.retrieve_proteins(&suffixes); + if clean_taxa { + proteins.retain(|protein| searcher.taxon_valid(protein)) + } + Some((cutoff_used, proteins)) + } + /// Searches the `peptide` in the index multithreaded and retrieves the protein information from the + /// database This does NOT perform any of the analyses, it only retrieves the functional and + /// taxonomic annotations + /// + /// # Arguments + /// * `searcher` - The Searcher which contains the protein database + /// * `peptide` - The peptide that is being searched in the index + /// * `cutoff` - The maximum amount of matches we want to process from the index + /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search + /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the + /// taxonomy + /// + /// # Returns + /// + /// Returns Some(SearchOnlyResult) if the peptide has matches + /// Returns None if the peptides does not have any matches, or if the peptide is shorter than the + /// sparseness factor k used in the index + pub fn search_peptide_retrieve_annotations( + searcher: &Searcher, + peptide: &str, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, + ) -> Option { + let (cutoff_used, proteins) = search_proteins_for_peptide( + searcher, + peptide, + cutoff, + equalize_i_and_l, + clean_taxa, + )?; + let annotations = searcher.get_all_functional_annotations(&proteins); + let mut protein_info: Vec = ::alloc::vec::Vec::new(); + for (&protein, annotations) in proteins.iter().zip(annotations) { + protein_info + .push(ProteinInfo { + taxon: protein.taxon_id, + uniprot_accession: protein.uniprot_id.clone(), + functional_annotations: annotations, + }) + } + Some(SearchOnlyResult { + sequence: peptide.to_string(), + proteins: protein_info, + cutoff_used, + }) + } + /// Searches the `peptide` in the index multithreaded and performs the taxonomic and functional + /// analyses + /// + /// # Arguments + /// * `searcher` - The Searcher which contains the protein database + /// * `peptide` - The peptide that is being searched in the index + /// * `cutoff` - The maximum amount of matches we want to process from the index + /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search + /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the + /// taxonomy + /// + /// # Returns + /// + /// Returns Some(SearchResultWithAnalysis) if the peptide has matches + /// Returns None if the peptides does not have any matches, or if the peptide is shorter than the + /// sparseness factor k used in the index + pub fn analyse_peptide( + searcher: &Searcher, + peptide: &str, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, + ) -> Option { + let (cutoff_used, mut proteins) = search_proteins_for_peptide( + searcher, + peptide, + cutoff, + equalize_i_and_l, + clean_taxa, + )?; + if clean_taxa { + proteins.retain(|protein| searcher.taxon_valid(protein)) + } + let lca = if cutoff_used { Some(1) } else { searcher.retrieve_lca(&proteins) }; + lca?; + let mut uniprot_accession_numbers = ::alloc::vec::Vec::new(); + let mut taxa = ::alloc::vec::Vec::new(); + for protein in &proteins { + taxa.push(protein.taxon_id); + uniprot_accession_numbers.push(protein.uniprot_id.clone()); + } + let fa = searcher.retrieve_function(&proteins); + Some(SearchResultWithAnalysis { + sequence: peptide.to_string(), + lca, + cutoff_used, + uniprot_accession_numbers, + taxa, + fa, + }) + } + /// Searches the list of `peptides` in the index multithreaded and performs the functional and + /// taxonomic analyses + /// + /// # Arguments + /// * `searcher` - The Searcher which contains the protein database + /// * `peptides` - List of peptides we want to search in the index + /// * `cutoff` - The maximum amount of matches we want to process from the index + /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search + /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the + /// taxonomy + /// + /// # Returns + /// + /// Returns an `OutputData` object with the search and analyses results + /// for the peptides + pub fn analyse_all_peptides( + searcher: &Searcher, + peptides: &Vec, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, + ) -> OutputData { + let res: Vec = peptides + .par_iter() + .map(|peptide| analyse_peptide( + searcher, + peptide, + cutoff, + equalize_i_and_l, + clean_taxa, + )) + .filter_map(|search_result| search_result) + .collect(); + OutputData { result: res } + } + /// Searches the list of `peptides` in the index and retrieves all related information about the + /// found proteins This does NOT perform any of the analyses + /// + /// # Arguments + /// * `searcher` - The Searcher which contains the protein database + /// * `peptides` - List of peptides we want to search in the index + /// * `cutoff` - The maximum amount of matches we want to process from the index + /// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search + /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the + /// taxonomy + /// + /// # Returns + /// + /// Returns an `OutputData` object with the search results for the peptides + pub fn search_all_peptides( + searcher: &Searcher, + peptides: &Vec, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, + ) -> OutputData { + let res: Vec = peptides + .par_iter() + .map(|peptide| { + search_peptide_retrieve_annotations( + searcher, + peptide, + cutoff, + equalize_i_and_l, + clean_taxa, + ) + }) + .filter_map(|search_result| search_result) + .collect(); + OutputData { result: res } + } +} +pub mod sa_searcher { + use std::cmp::min; + use sa_mappings::{ + functionality::{FunctionAggregator, FunctionalAggregation}, + proteins::{Protein, Proteins}, + taxonomy::TaxonAggregator, + }; + use umgap::taxon::TaxonId; + use crate::{ + define_struct, sa_searcher::BoundSearch::{Maximum, Minimum}, + suffix_to_protein_index::SuffixToProteinIndex, Nullable, SuffixArray, + }; + /// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array + enum BoundSearch { + Minimum, + Maximum, + } + #[automatically_derived] + impl ::core::clone::Clone for BoundSearch { + #[inline] + fn clone(&self) -> BoundSearch { + *self + } + } + #[automatically_derived] + impl ::core::marker::Copy for BoundSearch {} + #[automatically_derived] + impl ::core::marker::StructuralPartialEq for BoundSearch {} + #[automatically_derived] + impl ::core::cmp::PartialEq for BoundSearch { + #[inline] + fn eq(&self, other: &BoundSearch) -> bool { + let __self_tag = ::core::intrinsics::discriminant_value(self); + let __arg1_tag = ::core::intrinsics::discriminant_value(other); + __self_tag == __arg1_tag + } + } + /// Enum representing the minimum and maximum bound of the found matches in the suffix array + pub enum BoundSearchResult { + NoMatches, + SearchResult((usize, usize)), + } + #[automatically_derived] + impl ::core::marker::StructuralPartialEq for BoundSearchResult {} + #[automatically_derived] + impl ::core::cmp::PartialEq for BoundSearchResult { + #[inline] + fn eq(&self, other: &BoundSearchResult) -> bool { + let __self_tag = ::core::intrinsics::discriminant_value(self); + let __arg1_tag = ::core::intrinsics::discriminant_value(other); + __self_tag == __arg1_tag + && match (self, other) { + ( + BoundSearchResult::SearchResult(__self_0), + BoundSearchResult::SearchResult(__arg1_0), + ) => *__self_0 == *__arg1_0, + _ => true, + } + } + } + #[automatically_derived] + impl ::core::fmt::Debug for BoundSearchResult { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + match self { + BoundSearchResult::NoMatches => { + ::core::fmt::Formatter::write_str(f, "NoMatches") + } + BoundSearchResult::SearchResult(__self_0) => { + ::core::fmt::Formatter::debug_tuple_field1_finish( + f, + "SearchResult", + &__self_0, + ) + } + } + } + } + /// Enum representing the matching suffixes after searching a peptide in the suffix array + /// Both the MaxMatches and SearchResult indicate found suffixes, but MaxMatches is used when the + /// cutoff is reached. + pub enum SearchAllSuffixesResult { + NoMatches, + MaxMatches(Vec), + SearchResult(Vec), + } + #[automatically_derived] + impl ::core::fmt::Debug for SearchAllSuffixesResult { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + match self { + SearchAllSuffixesResult::NoMatches => { + ::core::fmt::Formatter::write_str(f, "NoMatches") + } + SearchAllSuffixesResult::MaxMatches(__self_0) => { + ::core::fmt::Formatter::debug_tuple_field1_finish( + f, + "MaxMatches", + &__self_0, + ) + } + SearchAllSuffixesResult::SearchResult(__self_0) => { + ::core::fmt::Formatter::debug_tuple_field1_finish( + f, + "SearchResult", + &__self_0, + ) + } + } + } + } + /// Custom implementation of partialEq for SearchAllSuffixesResult + /// We consider 2 SearchAllSuffixesResult equal if they exist of the same key, and the Vec contains + /// the same values, but the order can be different + impl PartialEq for SearchAllSuffixesResult { + fn eq(&self, other: &Self) -> bool { + /// Returns true if `arr1` and `arr2` contains the same elements, the order of the elements + /// is ignored + /// + /// # Arguments + /// * `arr1` - The first array used in the comparison + /// * `arr2` - The second array used in the comparison + /// + /// # Returns + /// + /// Returns true if arr1 and arr2 contains the same elements, the order of the elements is + /// ignored + fn array_eq_unordered(arr1: &[i64], arr2: &[i64]) -> bool { + let mut arr1_copy = arr1.to_owned(); + let mut arr2_copy = arr2.to_owned(); + arr1_copy.sort(); + arr2_copy.sort(); + arr1_copy == arr2_copy + } + match (self, other) { + ( + SearchAllSuffixesResult::MaxMatches(arr1), + SearchAllSuffixesResult::MaxMatches(arr2), + ) => array_eq_unordered(arr1, arr2), + ( + SearchAllSuffixesResult::SearchResult(arr1), + SearchAllSuffixesResult::SearchResult(arr2), + ) => array_eq_unordered(arr1, arr2), + ( + SearchAllSuffixesResult::NoMatches, + SearchAllSuffixesResult::NoMatches, + ) => true, + _ => false, + } + } + } + /// Struct that contains all the elements needed to search a peptide in the suffix array + /// This struct also contains all the functions used for search + /// + /// # Arguments + /// * `sa` - The sparse suffix array representing the protein database + /// * `sparseness_factor` - The sparseness factor used by the suffix array + /// * `suffix_index_to_protein` - Mapping from a suffix to the proteins to know which a suffix is + /// part of + /// * `taxon_id_calculator` - Object representing the used taxonomy and that calculates the + /// taxonomic analysis provided by Unipept + /// * `function_aggregator` - Object used to retrieve the functional annotations and to calculate + /// the functional analysis provided by Unipept + pub struct Searcher {} + impl Searcher { + /// Creates a new Searcher object + /// + /// # Arguments + /// * `sa` - The sparse suffix array representing the protein database + /// * `sparseness_factor` - The sparseness factor used by the suffix array + /// * `suffix_index_to_protein` - Mapping from a suffix to the proteins to know which a suffix + /// is part of + /// * `proteins` - List of all the proteins where the suffix array is build on + /// * `taxon_id_calculator` - Object representing the used taxonomy and that calculates the + /// taxonomic analysis provided by Unipept + /// * `function_aggregator` - Object used to retrieve the functional annotations and to + /// calculate the functional analysis provided by Unipept + /// + /// # Returns + /// + /// Returns a new Searcher object + pub fn new( + sa: SuffixArray, + sparseness_factor: u8, + suffix_index_to_protein: Box, + proteins: Proteins, + taxon_id_calculator: TaxonAggregator, + function_aggregator: FunctionAggregator, + ) -> Self { + Self { + sa, + sparseness_factor, + suffix_index_to_protein, + proteins, + taxon_id_calculator, + function_aggregator, + } + } + /// Compares the `search_string` to the `suffix` + /// During search this function performs extra logic since the suffix array is build with I == + /// L, while ` self.proteins.input_string` is the original text where I != L + /// + /// # Arguments + /// * `search_string` - The string/peptide being searched in the suffix array + /// * `suffix` - The current suffix from the suffix array we are comparing with in the binary + /// search + /// * `skip` - How many characters we can skip in the comparison because we already know these + /// match + /// * `bound` - Indicates if we are searching for the min of max bound + /// + /// # Returns + /// + /// The first argument is true if `bound` == `Minimum` and `search_string` <= `suffix` or if + /// `bound` == `Maximum` and `search_string` >= `suffix` The second argument indicates how + /// far the `suffix` and `search_string` matched + fn compare( + &self, + search_string: &[u8], + suffix: i64, + skip: usize, + bound: BoundSearch, + ) -> (bool, usize) { + let mut index_in_suffix = (suffix as usize) + skip; + let mut index_in_search_string = skip; + let mut is_cond_or_equal = false; + let condition_check = match bound { + Minimum => |a: u8, b: u8| a < b, + Maximum => |a: u8, b: u8| a > b, + }; + while index_in_search_string < search_string.len() + && index_in_suffix < self.proteins.input_string.len() + && (search_string[index_in_search_string] + == self.proteins.input_string[index_in_suffix] + || (search_string[index_in_search_string] == b'L' + && self.proteins.input_string[index_in_suffix] == b'I') + || (search_string[index_in_search_string] == b'I' + && self.proteins.input_string[index_in_suffix] == b'L')) + { + index_in_suffix += 1; + index_in_search_string += 1; + } + if !search_string.is_empty() { + if index_in_search_string == search_string.len() { + is_cond_or_equal = true + } else if index_in_suffix < self.proteins.input_string.len() { + let peptide_char = if search_string[index_in_search_string] == b'L' { + b'I' + } else { + search_string[index_in_search_string] + }; + let protein_char = if self.proteins.input_string[index_in_suffix] + == b'L' + { + b'I' + } else { + self.proteins.input_string[index_in_suffix] + }; + is_cond_or_equal = condition_check(peptide_char, protein_char); + } + } + (is_cond_or_equal, index_in_search_string) + } + /// Searches for the minimum or maximum bound for a string in the suffix array + /// + /// # Arguments + /// * `bound` - Indicates if we are searching the minimum or maximum bound + /// * `search_string` - The string/peptide we are searching in the suffix array + /// + /// # Returns + /// + /// The first argument is true if a match was found + /// The second argument indicates the index of the minimum or maximum bound for the match + /// (depending on `bound`) + fn binary_search_bound( + &self, + bound: BoundSearch, + search_string: &[u8], + ) -> (bool, usize) { + let mut left: usize = 0; + let mut right: usize = self.sa.len(); + let mut lcp_left: usize = 0; + let mut lcp_right: usize = 0; + let mut found = false; + while right - left > 1 { + let center = (left + right) / 2; + let skip = min(lcp_left, lcp_right); + let (retval, lcp_center) = self + .compare(search_string, self.sa.get(center), skip, bound); + found |= lcp_center == search_string.len(); + if retval && bound == Minimum || !retval && bound == Maximum { + right = center; + lcp_right = lcp_center; + } else { + left = center; + lcp_left = lcp_center; + } + } + if right == 1 && left == 0 { + let (retval, lcp_center) = self + .compare( + search_string, + self.sa.get(0), + min(lcp_left, lcp_right), + bound, + ); + found |= lcp_center == search_string.len(); + if bound == Minimum && retval { + right = 0; + } + } + match bound { + Minimum => (found, right), + Maximum => (found, left), + } + } + /// Searches for the minimum and maximum bound for a string in the suffix array + /// + /// # Arguments + /// * `search_string` - The string/peptide we are searching in the suffix array + /// + /// # Returns + /// + /// Returns the minimum and maximum bound of all matches in the suffix array, or `NoMatches` if + /// no matches were found + pub fn search_bounds(&self, search_string: &[u8]) -> BoundSearchResult { + let (found_min, min_bound) = self + .binary_search_bound(Minimum, search_string); + if !found_min { + return BoundSearchResult::NoMatches; + } + let (_, max_bound) = self.binary_search_bound(Maximum, search_string); + BoundSearchResult::SearchResult((min_bound, max_bound + 1)) + } + /// Searches for the suffixes matching a search string + /// During search I and L can be equated + /// + /// # Arguments + /// * `search_string` - The string/peptide we are searching in the suffix array + /// * `max_matches` - The maximum amount of matches processed, if more matches are found we + /// don't process them + /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false + /// + /// # Returns + /// + /// Returns all the matching suffixes + #[inline] + pub fn search_matching_suffixes( + &self, + search_string: &[u8], + max_matches: usize, + equalize_i_and_l: bool, + ) -> SearchAllSuffixesResult { + let mut matching_suffixes: Vec = ::alloc::vec::Vec::new(); + let mut il_locations = ::alloc::vec::Vec::new(); + for (i, &character) in search_string.iter().enumerate() { + if character == b'I' || character == b'L' { + il_locations.push(i); + } + } + let mut skip: usize = 0; + while skip < self.sparseness_factor as usize { + let mut il_locations_start = 0; + while il_locations_start < il_locations.len() + && il_locations[il_locations_start] < skip + { + il_locations_start += 1; + } + let il_locations_current_suffix = &il_locations[il_locations_start..]; + let current_search_string_prefix = &search_string[..skip]; + let current_search_string_suffix = &search_string[skip..]; + let search_bound_result = self.search_bounds(&search_string[skip..]); + if let BoundSearchResult::SearchResult((min_bound, max_bound)) = search_bound_result { + let mut sa_index = min_bound; + while sa_index < max_bound { + let suffix = self.sa.get(sa_index) as usize; + if suffix >= skip + && ((skip == 0 + || Self::check_prefix( + current_search_string_prefix, + &self.proteins.input_string[suffix - skip..suffix], + equalize_i_and_l, + )) + && Self::check_suffix( + skip, + il_locations_current_suffix, + current_search_string_suffix, + &self + .proteins + .input_string[suffix..suffix + search_string.len() - skip], + equalize_i_and_l, + )) + { + matching_suffixes.push((suffix - skip) as i64); + if matching_suffixes.len() >= max_matches { + return SearchAllSuffixesResult::MaxMatches( + matching_suffixes, + ); + } + } + sa_index += 1; + } + } + skip += 1; + } + if matching_suffixes.is_empty() { + SearchAllSuffixesResult::NoMatches + } else { + SearchAllSuffixesResult::SearchResult(matching_suffixes) + } + } + /// Returns true of the prefixes are the same + /// if `equalize_i_and_l` is set to true, L and I are considered the same + /// + /// # Arguments + /// * `search_string_prefix` - The unchecked prefix of the string/peptide that is searched + /// * `index_prefix` - The unchecked prefix from the protein from the suffix array + /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false + /// + /// # Returns + /// + /// Returns true if `search_string_prefix` and `index_prefix` are considered the same, otherwise + /// false + #[inline] + fn check_prefix( + search_string_prefix: &[u8], + index_prefix: &[u8], + equalize_i_and_l: bool, + ) -> bool { + if equalize_i_and_l { + search_string_prefix + .iter() + .zip(index_prefix) + .all(|(&search_character, &index_character)| { + search_character == index_character + || (search_character == b'I' && index_character == b'L') + || (search_character == b'L' && index_character == b'I') + }) + } else { + search_string_prefix == index_prefix + } + } + /// Returns true of the search_string and index_string are equal + /// This is automatically true if `equalize_i_and_l` is set to true, since there matched during + /// search where I = L If `equalize_i_and_l` is set to false, we need to check if the I and + /// L locations have the same character + /// + /// # Arguments + /// * `skip` - The used skip factor during the search iteration + /// * `il_locations` - The locations of the I's and L's in the **original** peptide + /// * `search_string` - The peptide that is being searched, but already with the skipped prefix + /// removed from it + /// * `index_string` - The suffix that search_string matches with when I and L were equalized + /// during search + /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false + /// + /// # Returns + /// + /// Returns true if `search_string` and `index_string` are considered the same, otherwise false + fn check_suffix( + skip: usize, + il_locations: &[usize], + search_string: &[u8], + index_string: &[u8], + equalize_i_and_l: bool, + ) -> bool { + if equalize_i_and_l { + true + } else { + for &il_location in il_locations { + let index = il_location - skip; + if search_string[index] != index_string[index] { + return false; + } + } + true + } + } + /// Returns all the proteins that correspond with the provided suffixes + /// + /// # Arguments + /// * `suffixes` - List of suffix indices + /// + /// # Returns + /// + /// Returns the proteins that every suffix is a part of + #[inline] + pub fn retrieve_proteins(&self, suffixes: &Vec) -> Vec<&Protein> { + let mut res = ::alloc::vec::Vec::new(); + for &suffix in suffixes { + let protein_index = self + .suffix_index_to_protein + .suffix_to_protein(suffix); + if !protein_index.is_null() { + res.push(&self.proteins[protein_index as usize]); + } + } + res + } + /// Searches all the matching proteins for a search_string/peptide in the suffix array + /// + /// # Arguments + /// * `search_string` - The string/peptide being searched + /// * `equalize_i_and_l` - If set to true, I and L are equalized during search + /// + /// # Returns + /// + /// Returns the matching proteins for the search_string + pub fn search_proteins_for_peptide( + &self, + search_string: &[u8], + equalize_i_and_l: bool, + ) -> Vec<&Protein> { + let mut matching_suffixes = ::alloc::vec::Vec::new(); + if let SearchAllSuffixesResult::SearchResult(suffixes) = self + .search_matching_suffixes(search_string, usize::MAX, equalize_i_and_l) + { + matching_suffixes = suffixes; + } + self.retrieve_proteins(&matching_suffixes) + } + /// Retrieves the taxonomic analysis for a collection of proteins + /// + /// # Arguments + /// * `proteins` - A collection of proteins + /// + /// # Returns + /// + /// Returns the taxonomic analysis result for the given list of proteins + #[inline] + pub fn retrieve_lca(&self, proteins: &[&Protein]) -> Option { + let taxon_ids: Vec = proteins + .iter() + .map(|prot| prot.taxon_id) + .collect(); + self.taxon_id_calculator + .aggregate(taxon_ids) + .map(|id| self.taxon_id_calculator.snap_taxon(id)) + } + /// Returns true if the protein is considered valid by the provided taxonomy file + /// + /// # Arguments + /// * `protein` - A protein of which we want to check the validity + /// + /// # Returns + /// + /// Returns true if the protein is considered valid by the provided taxonomy file + pub fn taxon_valid(&self, protein: &Protein) -> bool { + self.taxon_id_calculator.taxon_valid(protein.taxon_id) + } + /// Retrieves the functional analysis for a collection of proteins + /// + /// # Arguments + /// * `proteins` - A collection of proteins + /// + /// # Returns + /// + /// Returns the functional analysis result for the given list of proteins + pub fn retrieve_function( + &self, + proteins: &[&Protein], + ) -> Option { + let res = self.function_aggregator.aggregate(proteins.to_vec()); + Some(res) + } + /// Retrieves the all the functional annotations for a collection of proteins + /// + /// # Arguments + /// * `proteins` - A collection of proteins + /// + /// # Returns + /// + /// Returns all functional annotations for a collection of proteins + pub fn get_all_functional_annotations( + &self, + proteins: &[&Protein], + ) -> Vec> { + self.function_aggregator.get_all_functional_annotations(proteins) + } + } +} +pub mod suffix_to_protein_index { + use clap::ValueEnum; + use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER}; + use crate::Nullable; + /// Enum used to define the commandline arguments and choose which index style is used + pub enum SuffixToProteinMappingStyle { + Dense, + Sparse, + } + #[allow( + dead_code, + unreachable_code, + unused_variables, + unused_braces, + unused_qualifications, + )] + #[allow( + clippy::style, + clippy::complexity, + clippy::pedantic, + clippy::restriction, + clippy::perf, + clippy::deprecated, + clippy::nursery, + clippy::cargo, + clippy::suspicious_else_formatting, + clippy::almost_swapped, + clippy::redundant_locals, + )] + #[automatically_derived] + impl clap::ValueEnum for SuffixToProteinMappingStyle { + fn value_variants<'a>() -> &'a [Self] { + &[Self::Dense, Self::Sparse] + } + fn to_possible_value<'a>( + &self, + ) -> ::std::option::Option { + match self { + Self::Dense => Some({ clap::builder::PossibleValue::new("dense") }), + Self::Sparse => Some({ clap::builder::PossibleValue::new("sparse") }), + _ => None, + } + } + } + #[automatically_derived] + impl ::core::clone::Clone for SuffixToProteinMappingStyle { + #[inline] + fn clone(&self) -> SuffixToProteinMappingStyle { + match self { + SuffixToProteinMappingStyle::Dense => SuffixToProteinMappingStyle::Dense, + SuffixToProteinMappingStyle::Sparse => { + SuffixToProteinMappingStyle::Sparse + } + } + } + } + #[automatically_derived] + impl ::core::fmt::Debug for SuffixToProteinMappingStyle { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::write_str( + f, + match self { + SuffixToProteinMappingStyle::Dense => "Dense", + SuffixToProteinMappingStyle::Sparse => "Sparse", + }, + ) + } + } + #[automatically_derived] + impl ::core::marker::StructuralPartialEq for SuffixToProteinMappingStyle {} + #[automatically_derived] + impl ::core::cmp::PartialEq for SuffixToProteinMappingStyle { + #[inline] + fn eq(&self, other: &SuffixToProteinMappingStyle) -> bool { + let __self_tag = ::core::intrinsics::discriminant_value(self); + let __arg1_tag = ::core::intrinsics::discriminant_value(other); + __self_tag == __arg1_tag + } + } + /// Trait implemented by the SuffixToProtein mappings + pub trait SuffixToProteinIndex: Send + Sync { + /// Returns the index of the protein in the protein list for the given suffix + /// + /// # Arguments + /// * `suffix` - The suffix of which we want to know of which protein it is a part + /// + /// # Returns + /// + /// Returns the index of the protein in the proteins list of which the suffix is a part + fn suffix_to_protein(&self, suffix: i64) -> u32; + } + /// Mapping that uses O(n) memory with n the size of the input text, but retrieval of the protein is + /// in O(1) + pub struct DenseSuffixToProtein { + mapping: Vec, + } + #[automatically_derived] + impl ::core::fmt::Debug for DenseSuffixToProtein { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_struct_field1_finish( + f, + "DenseSuffixToProtein", + "mapping", + &&self.mapping, + ) + } + } + #[automatically_derived] + impl ::core::marker::StructuralPartialEq for DenseSuffixToProtein {} + #[automatically_derived] + impl ::core::cmp::PartialEq for DenseSuffixToProtein { + #[inline] + fn eq(&self, other: &DenseSuffixToProtein) -> bool { + self.mapping == other.mapping + } + } + /// Mapping that uses O(m) memory with m the number of proteins, but retrieval of the protein is + /// O(log m) + pub struct SparseSuffixToProtein { + mapping: Vec, + } + #[automatically_derived] + impl ::core::fmt::Debug for SparseSuffixToProtein { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_struct_field1_finish( + f, + "SparseSuffixToProtein", + "mapping", + &&self.mapping, + ) + } + } + #[automatically_derived] + impl ::core::marker::StructuralPartialEq for SparseSuffixToProtein {} + #[automatically_derived] + impl ::core::cmp::PartialEq for SparseSuffixToProtein { + #[inline] + fn eq(&self, other: &SparseSuffixToProtein) -> bool { + self.mapping == other.mapping + } + } + impl SuffixToProteinIndex for DenseSuffixToProtein { + fn suffix_to_protein(&self, suffix: i64) -> u32 { + self.mapping[suffix as usize] + } + } + impl SuffixToProteinIndex for SparseSuffixToProtein { + fn suffix_to_protein(&self, suffix: i64) -> u32 { + let protein_index = self + .mapping + .binary_search(&suffix) + .unwrap_or_else(|index| index - 1); + if self.mapping[protein_index + 1] == suffix + 1 { + return u32::NULL; + } + protein_index as u32 + } + } + impl DenseSuffixToProtein { + /// Creates a new DenseSuffixToProtein mapping + /// + /// # Arguments + /// * `text` - The text over which we want to create the mapping + /// + /// # Returns + /// + /// Returns a new DenseSuffixToProtein build over the provided text + pub fn new(text: &[u8]) -> Self { + let mut current_protein_index: u32 = 0; + let mut suffix_index_to_protein: Vec = ::alloc::vec::Vec::new(); + for &char in text.iter() { + if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER { + current_protein_index += 1; + suffix_index_to_protein.push(u32::NULL); + } else { + match (¤t_protein_index, &u32::NULL) { + (left_val, right_val) => { + if *left_val == *right_val { + let kind = ::core::panicking::AssertKind::Ne; + ::core::panicking::assert_failed( + kind, + &*left_val, + &*right_val, + ::core::option::Option::None, + ); + } + } + }; + suffix_index_to_protein.push(current_protein_index); + } + } + suffix_index_to_protein.shrink_to_fit(); + DenseSuffixToProtein { + mapping: suffix_index_to_protein, + } + } + } + impl SparseSuffixToProtein { + /// Creates a new SparseSuffixToProtein mapping + /// + /// # Arguments + /// * `text` - The text over which we want to create the mapping + /// + /// # Returns + /// + /// Returns a new SparseSuffixToProtein build over the provided text + pub fn new(text: &[u8]) -> Self { + let mut suffix_index_to_protein: Vec = <[_]>::into_vec( + #[rustc_box] + ::alloc::boxed::Box::new([0]), + ); + for (index, &char) in text.iter().enumerate() { + if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER { + suffix_index_to_protein.push(index as i64 + 1); + } + } + suffix_index_to_protein.shrink_to_fit(); + SparseSuffixToProtein { + mapping: suffix_index_to_protein, + } + } + } +} +/// Represents a suffix array. +pub enum SuffixArray { + /// The original suffix array. + Original(Vec), + /// The compressed suffix array. + Compressed(BitArray), +} +impl SuffixArray { + /// Returns the length of the suffix array. + /// + /// # Returns + /// + /// The length of the suffix array. + pub fn len(&self) -> usize { + match self { + SuffixArray::Original(sa) => sa.len(), + SuffixArray::Compressed(sa) => sa.len(), + } + } + /// Returns the suffix array at the given index. + /// + /// # Arguments + /// + /// * `index` - The index of the suffix array. + /// + /// # Returns + /// + /// The suffix array at the given index. + pub fn get(&self, index: usize) -> i64 { + match self { + SuffixArray::Original(sa) => sa[index], + SuffixArray::Compressed(sa) => sa.get(index) as i64, + } + } + /// Returns whether the suffix array is empty. + /// + /// # Returns + /// + /// True if the suffix array is empty, false otherwise. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} +/// Custom trait implemented by types that have a value that represents NULL +pub trait Nullable { + const NULL: T; + /// Returns whether the value is NULL. + /// + /// # Returns + /// + /// True if the value is NULL, false otherwise. + fn is_null(&self) -> bool; +} +/// Implementation of the `Nullable` trait for the `u32` type. +impl Nullable for u32 { + const NULL: u32 = u32::MAX; + fn is_null(&self) -> bool { + *self == Self::NULL + } +} +pub(crate) use define_struct; diff --git a/sa-index/src/peptide_search.rs b/sa-index/src/peptide_search.rs index 8e2c0dd..08a535c 100644 --- a/sa-index/src/peptide_search.rs +++ b/sa-index/src/peptide_search.rs @@ -84,6 +84,7 @@ pub fn search_proteins_for_peptide<'a>( } SearchAllSuffixesResult::SearchResult(matched_suffixes) => matched_suffixes, SearchAllSuffixesResult::NoMatches => { + eprintln!("No matches found for peptide: {}", peptide); return None; } }; @@ -277,3 +278,72 @@ pub fn search_all_peptides( result: res } } + +#[cfg(test)] +mod tests { + use super::*; + + fn assert_json_eq(generated_json: &str, expected_json: &str) { + assert_eq!( + generated_json.parse::().unwrap(), + expected_json.parse::().unwrap(), + ); + } + + #[test] + fn test_serialize_output_data() { + let output_data = OutputData { + result: vec![ 1, 2, 3 ] + }; + + let generated_json = serde_json::to_string(&output_data).unwrap(); + let expected_json = "{\"result\":[1,2,3]}"; + + assert_json_eq(&generated_json, expected_json); + } + + #[test] + fn test_serialize_search_result_with_analysis() { + let search_result = SearchResultWithAnalysis { + sequence: "MSKIAALLPSV".to_string(), + lca: Some(1), + taxa: vec![1, 2, 3], + uniprot_accession_numbers: vec!["P12345".to_string(), "P23456".to_string()], + fa: None, + cutoff_used: true + }; + + let generated_json = serde_json::to_string(&search_result).unwrap(); + let expected_json = "{\"sequence\":\"MSKIAALLPSV\",\"lca\":1,\"taxa\":[1,2,3],\"uniprot_accession_numbers\":[\"P12345\",\"P23456\"],\"fa\":null,\"cutoff_used\":true}"; + + assert_json_eq(&generated_json, expected_json); + } + + #[test] + fn test_serialize_protein_info() { + let protein_info = ProteinInfo { + taxon: 1, + uniprot_accession: "P12345".to_string(), + functional_annotations: vec!["GO:0001234".to_string(), "GO:0005678".to_string()] + }; + + let generated_json = serde_json::to_string(&protein_info).unwrap(); + let expected_json = "{\"taxon\":1,\"uniprot_accession\":\"P12345\",\"functional_annotations\":[\"GO:0001234\",\"GO:0005678\"]}"; + + assert_json_eq(&generated_json, expected_json); + } + + #[test] + fn test_serialize_search_only_result() { + let search_result = SearchOnlyResult { + sequence: "MSKIAALLPSV".to_string(), + proteins: vec![], + cutoff_used: true + }; + + let generated_json = serde_json::to_string(&search_result).unwrap(); + let expected_json = "{\"sequence\":\"MSKIAALLPSV\",\"proteins\":[],\"cutoff_used\":true}"; + + assert_json_eq(&generated_json, expected_json); + } +} diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index 5bf924b..d5ab080 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -17,10 +17,7 @@ use crate::{ sa_searcher::BoundSearch::{ Maximum, Minimum - }, - suffix_to_protein_index::SuffixToProteinIndex, - Nullable, - SuffixArray + }, suffix_to_protein_index::SuffixToProteinIndex, Nullable, SuffixArray }; /// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array @@ -101,12 +98,12 @@ impl PartialEq for SearchAllSuffixesResult { /// * `function_aggregator` - Object used to retrieve the functional annotations and to calculate /// the functional analysis provided by Unipept pub struct Searcher { - sa: SuffixArray, + pub sa: SuffixArray, pub sparseness_factor: u8, - suffix_index_to_protein: Box, - proteins: Proteins, - taxon_id_calculator: TaxonAggregator, - function_aggregator: FunctionAggregator + pub suffix_index_to_protein: Box, + pub proteins: Proteins, + pub taxon_id_calculator: TaxonAggregator, + pub function_aggregator: FunctionAggregator } impl Searcher { @@ -548,3 +545,28 @@ impl Searcher { .get_all_functional_annotations(proteins) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_partial_eq_search_all_suffixes_result() { + let search_all_suffixes_result_1 = SearchAllSuffixesResult::SearchResult(vec![1, 2, 3]); + let search_all_suffixes_result_2 = SearchAllSuffixesResult::SearchResult(vec![3, 2, 1]); + let search_all_suffixes_result_3 = SearchAllSuffixesResult::SearchResult(vec![1, 2, 4]); + let search_all_suffixes_result_4 = SearchAllSuffixesResult::MaxMatches(vec![1, 2, 3]); + let search_all_suffixes_result_5 = SearchAllSuffixesResult::MaxMatches(vec![3, 2, 1]); + let search_all_suffixes_result_6 = SearchAllSuffixesResult::MaxMatches(vec![1, 2, 4]); + let search_all_suffixes_result_7 = SearchAllSuffixesResult::NoMatches; + let search_all_suffixes_result_8 = SearchAllSuffixesResult::NoMatches; + + assert_eq!(search_all_suffixes_result_1, search_all_suffixes_result_2); + assert_ne!(search_all_suffixes_result_1, search_all_suffixes_result_3); + assert_eq!(search_all_suffixes_result_4, search_all_suffixes_result_5); + assert_ne!(search_all_suffixes_result_4, search_all_suffixes_result_6); + assert_eq!(search_all_suffixes_result_7, search_all_suffixes_result_8); + assert_ne!(search_all_suffixes_result_1, search_all_suffixes_result_7); + assert_ne!(search_all_suffixes_result_4, search_all_suffixes_result_7); + } +} diff --git a/sa-mappings/src/taxonomy.rs b/sa-mappings/src/taxonomy.rs index a1aabb7..09c330b 100644 --- a/sa-mappings/src/taxonomy.rs +++ b/sa-mappings/src/taxonomy.rs @@ -14,10 +14,7 @@ use umgap::{ mix::MixCalculator }, taxon::{ - read_taxa_file, - TaxonId, - TaxonList, - TaxonTree + read_taxa_file, Taxon, TaxonId, TaxonList, TaxonTree } }; @@ -25,10 +22,8 @@ use umgap::{ pub struct TaxonAggregator { /// A vector that contains the snapped taxon IDs. snapping: Vec>, - /// The aggregator used to aggregate taxon IDs. aggregator: Box, - /// The taxon list. taxon_list: TaxonList } @@ -43,6 +38,33 @@ pub enum AggregationMethod { } impl TaxonAggregator { + /// Creates a new `TaxonAggregator` with the given taxa and aggregation method. + /// + /// # Arguments + /// + /// * `taxa` - A vector of `Taxon` objects representing the taxa. + /// * `method` - An `AggregationMethod` enum specifying the aggregation method to use. + /// + /// # Returns + /// + /// Returns a new `TaxonAggregator` instance. + pub fn new(taxa: Vec, method: AggregationMethod) -> Self { + let taxon_tree = TaxonTree::new(&taxa); + let taxon_list = TaxonList::new(taxa); + let snapping = taxon_tree.snapping(&taxon_list, true); + + let aggregator: Box = match method { + AggregationMethod::Lca => Box::new(MixCalculator::new(taxon_tree, 1.0)), + AggregationMethod::LcaStar => Box::new(LCACalculator::new(taxon_tree)) + }; + + Self { + snapping, + aggregator, + taxon_list + } + } + /// Creates a new `TaxonAggregator` from a taxonomy file and an aggregation method. /// /// # Arguments @@ -62,20 +84,7 @@ impl TaxonAggregator { method: AggregationMethod ) -> Result> { let taxons = read_taxa_file(file)?; - let taxon_tree = TaxonTree::new(&taxons); - let taxon_list = TaxonList::new(taxons); - let snapping = taxon_tree.snapping(&taxon_list, true); - - let aggregator: Box = match method { - AggregationMethod::Lca => Box::new(MixCalculator::new(taxon_tree, 1.0)), - AggregationMethod::LcaStar => Box::new(LCACalculator::new(taxon_tree)) - }; - - Ok(Self { - snapping, - aggregator, - taxon_list - }) + Ok(Self::new(taxons, method)) } /// Checks if a taxon exists in the taxon list. @@ -157,6 +166,7 @@ mod tests { }; use tempdir::TempDir; + use umgap::rank::Rank; use super::*; @@ -183,6 +193,30 @@ mod tests { taxonomy_file } + #[test] + fn test_new() { + TaxonAggregator::new( + vec![ + Taxon::new(1, "root".to_string(), Rank::NoRank, 1, true), + Taxon::new(2, "Bacteria".to_string(), Rank::Superkingdom, 1, true), + Taxon::new(6, "Azorhizobium".to_string(), Rank::Genus, 1, true), + Taxon::new(7, "Azorhizobium caulinodans".to_string(), Rank::Species, 6, true), + Taxon::new(9, "Buchnera aphidicola".to_string(), Rank::Species, 6, true), + Taxon::new(10, "Cellvibrio".to_string(), Rank::Genus, 6, true), + Taxon::new(11, "Cellulomonas gilvus".to_string(), Rank::Species, 10, true), + Taxon::new(13, "Dictyoglomus".to_string(), Rank::Genus, 11, true), + Taxon::new(14, "Dictyoglomus thermophilum".to_string(), Rank::Species, 10, true), + Taxon::new(16, "Methylophilus".to_string(), Rank::Genus, 14, true), + Taxon::new(17, "Methylophilus methylotrophus".to_string(), Rank::Species, 16, true), + Taxon::new(18, "Pelobacter".to_string(), Rank::Genus, 17, true), + Taxon::new(19, "Syntrophotalea carbinolica".to_string(), Rank::Species, 17, true), + Taxon::new(20, "Phenylobacterium".to_string(), Rank::Genus, 19, true), + Taxon::new(21, "Invalid".to_string(), Rank::Species, 19, false) + ], + AggregationMethod::Lca + ); + } + #[test] fn test_try_from_taxonomy_file() { // Create a temporary directory for this test