Skip to content

Commit

Permalink
Add a check for trypticity
Browse files Browse the repository at this point in the history
  • Loading branch information
tibvdm committed Aug 22, 2024
1 parent f15e3f8 commit cb7afee
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 19 deletions.
14 changes: 8 additions & 6 deletions sa-index/src/peptide_search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ pub fn search_proteins_for_peptide<'a>(
searcher: &'a Searcher,
peptide: &str,
cutoff: usize,
equate_il: bool
equate_il: bool,
missed: bool
) -> Option<(bool, Vec<&'a Protein>)> {
let peptide = peptide.trim_end().to_uppercase();

Expand All @@ -59,7 +60,7 @@ pub fn search_proteins_for_peptide<'a>(
return None;
}

let suffix_search = searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equate_il);
let suffix_search = searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equate_il, missed);
let (suffixes, cutoff_used) = match suffix_search {
SearchAllSuffixesResult::MaxMatches(matched_suffixes) => Some((matched_suffixes, true)),
SearchAllSuffixesResult::SearchResult(matched_suffixes) => Some((matched_suffixes, false)),
Expand All @@ -71,8 +72,8 @@ pub fn search_proteins_for_peptide<'a>(
Some((cutoff_used, proteins))
}

pub fn search_peptide(searcher: &Searcher, peptide: &str, cutoff: usize, equate_il: bool) -> Option<SearchResult> {
let (cutoff_used, proteins) = search_proteins_for_peptide(searcher, peptide, cutoff, equate_il)?;
pub fn search_peptide(searcher: &Searcher, peptide: &str, cutoff: usize, equate_il: bool, missed: bool) -> Option<SearchResult> {
let (cutoff_used, proteins) = search_proteins_for_peptide(searcher, peptide, cutoff, equate_il, missed)?;

Some(SearchResult {
sequence: peptide.to_string(),
Expand All @@ -99,11 +100,12 @@ pub fn search_all_peptides(
searcher: &Searcher,
peptides: &Vec<String>,
cutoff: usize,
equate_il: bool
equate_il: bool,
missed: bool
) -> Vec<SearchResult> {
peptides
.par_iter()
.filter_map(|peptide| search_peptide(searcher, peptide, cutoff, equate_il))
.filter_map(|peptide| search_peptide(searcher, peptide, cutoff, equate_il, missed))
.collect()
}

Expand Down
58 changes: 45 additions & 13 deletions sa-index/src/sa_searcher.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::{cmp::min, ops::Deref};

use sa_mappings::proteins::{Protein, Proteins};
use std::str::from_utf8;
use sa_mappings::proteins::{Protein, Proteins, SEPARATION_CHARACTER, TERMINATION_CHARACTER};

use crate::{
sa_searcher::BoundSearch::{Maximum, Minimum},
Expand Down Expand Up @@ -306,7 +306,8 @@ impl Searcher {
&self,
search_string: &[u8],
max_matches: usize,
equate_il: bool
equate_il: bool,
missed: bool
) -> SearchAllSuffixesResult {
let mut matching_suffixes: Vec<i64> = vec![];
let mut il_locations = vec![];
Expand All @@ -325,15 +326,24 @@ impl Searcher {
let il_locations_current_suffix = &il_locations[il_locations_start..];
let current_search_string_prefix = &search_string[..skip];
let current_search_string_suffix = &search_string[skip..];

eprintln!("skip: {}, current_search_string_prefix: {}, current_search_string_suffix: {}", skip, from_utf8(current_search_string_prefix).unwrap(), from_utf8(current_search_string_suffix).unwrap());

let search_bound_result = self.search_bounds(&search_string[skip..]);

eprintln!("search_bound_result: {:?}", search_bound_result);

// if the shorter part is matched, see if what goes before the matched suffix matches
// the unmatched part of the prefix
if let BoundSearchResult::SearchResult((min_bound, max_bound)) = search_bound_result {
eprintln!("min_bound: {}, max_bound: {}", min_bound, max_bound);

// try all the partially matched suffixes and store the matching suffixes in an
// array (stop when our max number of matches is reached)
let mut sa_index = min_bound;
while sa_index < max_bound {
let suffix = self.sa.get(sa_index) as usize;

// filter away matches where I was wrongfully equalized to L, and check the
// unmatched prefix when I and L equalized, we only need to
// check the prefix, not the whole match, when the prefix is 0, we don't need to
Expand All @@ -353,6 +363,24 @@ impl Searcher {
equate_il
))
{
if !missed {
let is_tryptic_match = (
(suffix - skip == 0)
|| self.proteins.input_string[suffix - skip - 1] == b'R'
|| self.proteins.input_string[suffix - skip - 1] == b'K'
|| self.proteins.input_string[suffix - skip - 1] == SEPARATION_CHARACTER
) && (
self.proteins.input_string[suffix - skip + search_string.len() + 1] != b'P'
|| self.proteins.input_string[suffix - skip + search_string.len() + 1] == SEPARATION_CHARACTER
|| self.proteins.input_string[suffix - skip + search_string.len() + 1] == TERMINATION_CHARACTER
);

if !is_tryptic_match {
sa_index += 1;
continue;
}
}

matching_suffixes.push((suffix - skip) as i64);

// return if max number of matches is reached
Expand Down Expand Up @@ -545,12 +573,16 @@ mod tests {
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));

// search suffix 'VAA'
let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false);
let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false, true);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![7]));
let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::NoMatches);

// search suffix 'AC'
let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![5, 11]));
let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false, true);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![11, 5]));
let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![11]));
}

#[test]
Expand Down Expand Up @@ -578,11 +610,11 @@ mod tests {
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));

// search bounds 'RIZ' with equal I and L
let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true);
let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true, true);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![16]));

// search bounds 'RIZ' without equal I and L
let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false);
let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false, true);
assert_eq!(found_suffixes, SearchAllSuffixesResult::NoMatches);
}

Expand All @@ -605,7 +637,7 @@ mod tests {
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));

// search bounds 'IM' with equal I and L
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true);
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true, true);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0]));
}

Expand All @@ -626,7 +658,7 @@ mod tests {
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));

let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true);
let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true, true);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![2, 3, 4, 5]));
}

Expand All @@ -647,7 +679,7 @@ mod tests {
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));

let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true, true);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4]));
}

Expand All @@ -670,7 +702,7 @@ mod tests {

// search all places where II is in the string IIIILL, but with a sparse SA
// this way we check if filtering the suffixes works as expected
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, false);
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, false, true);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2]));
}

Expand All @@ -692,7 +724,7 @@ mod tests {
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));

// search bounds 'IM' with equal I and L
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true, true);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4]));
}
}

0 comments on commit cb7afee

Please sign in to comment.