From cb7afee59a066ac2147688dbc4843ea120435271 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Thu, 22 Aug 2024 15:51:41 +0200 Subject: [PATCH] Add a check for trypticity --- sa-index/src/peptide_search.rs | 14 ++++---- sa-index/src/sa_searcher.rs | 58 ++++++++++++++++++++++++++-------- 2 files changed, 53 insertions(+), 19 deletions(-) diff --git a/sa-index/src/peptide_search.rs b/sa-index/src/peptide_search.rs index 55d629f..2069489 100644 --- a/sa-index/src/peptide_search.rs +++ b/sa-index/src/peptide_search.rs @@ -50,7 +50,8 @@ pub fn search_proteins_for_peptide<'a>( searcher: &'a Searcher, peptide: &str, cutoff: usize, - equate_il: bool + equate_il: bool, + missed: bool ) -> Option<(bool, Vec<&'a Protein>)> { let peptide = peptide.trim_end().to_uppercase(); @@ -59,7 +60,7 @@ pub fn search_proteins_for_peptide<'a>( return None; } - let suffix_search = searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equate_il); + let suffix_search = searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equate_il, missed); let (suffixes, cutoff_used) = match suffix_search { SearchAllSuffixesResult::MaxMatches(matched_suffixes) => Some((matched_suffixes, true)), SearchAllSuffixesResult::SearchResult(matched_suffixes) => Some((matched_suffixes, false)), @@ -71,8 +72,8 @@ pub fn search_proteins_for_peptide<'a>( Some((cutoff_used, proteins)) } -pub fn search_peptide(searcher: &Searcher, peptide: &str, cutoff: usize, equate_il: bool) -> Option { - let (cutoff_used, proteins) = search_proteins_for_peptide(searcher, peptide, cutoff, equate_il)?; +pub fn search_peptide(searcher: &Searcher, peptide: &str, cutoff: usize, equate_il: bool, missed: bool) -> Option { + let (cutoff_used, proteins) = search_proteins_for_peptide(searcher, peptide, cutoff, equate_il, missed)?; Some(SearchResult { sequence: peptide.to_string(), @@ -99,11 +100,12 @@ pub fn search_all_peptides( searcher: &Searcher, peptides: &Vec, cutoff: usize, - equate_il: bool + equate_il: bool, + missed: bool ) -> Vec { peptides .par_iter() - .filter_map(|peptide| search_peptide(searcher, peptide, cutoff, equate_il)) + .filter_map(|peptide| search_peptide(searcher, peptide, cutoff, equate_il, missed)) .collect() } diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index d09c704..8accf9d 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -1,6 +1,6 @@ use std::{cmp::min, ops::Deref}; - -use sa_mappings::proteins::{Protein, Proteins}; +use std::str::from_utf8; +use sa_mappings::proteins::{Protein, Proteins, SEPARATION_CHARACTER, TERMINATION_CHARACTER}; use crate::{ sa_searcher::BoundSearch::{Maximum, Minimum}, @@ -306,7 +306,8 @@ impl Searcher { &self, search_string: &[u8], max_matches: usize, - equate_il: bool + equate_il: bool, + missed: bool ) -> SearchAllSuffixesResult { let mut matching_suffixes: Vec = vec![]; let mut il_locations = vec![]; @@ -325,15 +326,24 @@ impl Searcher { let il_locations_current_suffix = &il_locations[il_locations_start..]; let current_search_string_prefix = &search_string[..skip]; let current_search_string_suffix = &search_string[skip..]; + + eprintln!("skip: {}, current_search_string_prefix: {}, current_search_string_suffix: {}", skip, from_utf8(current_search_string_prefix).unwrap(), from_utf8(current_search_string_suffix).unwrap()); + let search_bound_result = self.search_bounds(&search_string[skip..]); + + eprintln!("search_bound_result: {:?}", search_bound_result); + // if the shorter part is matched, see if what goes before the matched suffix matches // the unmatched part of the prefix if let BoundSearchResult::SearchResult((min_bound, max_bound)) = search_bound_result { + eprintln!("min_bound: {}, max_bound: {}", min_bound, max_bound); + // try all the partially matched suffixes and store the matching suffixes in an // array (stop when our max number of matches is reached) let mut sa_index = min_bound; while sa_index < max_bound { let suffix = self.sa.get(sa_index) as usize; + // filter away matches where I was wrongfully equalized to L, and check the // unmatched prefix when I and L equalized, we only need to // check the prefix, not the whole match, when the prefix is 0, we don't need to @@ -353,6 +363,24 @@ impl Searcher { equate_il )) { + if !missed { + let is_tryptic_match = ( + (suffix - skip == 0) + || self.proteins.input_string[suffix - skip - 1] == b'R' + || self.proteins.input_string[suffix - skip - 1] == b'K' + || self.proteins.input_string[suffix - skip - 1] == SEPARATION_CHARACTER + ) && ( + self.proteins.input_string[suffix - skip + search_string.len() + 1] != b'P' + || self.proteins.input_string[suffix - skip + search_string.len() + 1] == SEPARATION_CHARACTER + || self.proteins.input_string[suffix - skip + search_string.len() + 1] == TERMINATION_CHARACTER + ); + + if !is_tryptic_match { + sa_index += 1; + continue; + } + } + matching_suffixes.push((suffix - skip) as i64); // return if max number of matches is reached @@ -545,12 +573,16 @@ mod tests { let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search suffix 'VAA' - let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false); + let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![7])); + let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false, false); + assert_eq!(found_suffixes, SearchAllSuffixesResult::NoMatches); // search suffix 'AC' - let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false); - assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![5, 11])); + let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false, true); + assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![11, 5])); + let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false, false); + assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![11])); } #[test] @@ -578,11 +610,11 @@ mod tests { let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'RIZ' with equal I and L - let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true); + let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![16])); // search bounds 'RIZ' without equal I and L - let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false); + let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::NoMatches); } @@ -605,7 +637,7 @@ mod tests { let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'IM' with equal I and L - let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true); + let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0])); } @@ -626,7 +658,7 @@ mod tests { let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); - let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true); + let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![2, 3, 4, 5])); } @@ -647,7 +679,7 @@ mod tests { let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); - let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true); + let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4])); } @@ -670,7 +702,7 @@ mod tests { // search all places where II is in the string IIIILL, but with a sparse SA // this way we check if filtering the suffixes works as expected - let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, false); + let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, false, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2])); } @@ -692,7 +724,7 @@ mod tests { let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'IM' with equal I and L - let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true); + let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4])); } }