diff --git a/.idea/misc.xml b/.idea/misc.xml
deleted file mode 100644
index 639900d..0000000
--- a/.idea/misc.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/sa-index/src/peptide_search.rs b/sa-index/src/peptide_search.rs
index 55d629f..b3958c7 100644
--- a/sa-index/src/peptide_search.rs
+++ b/sa-index/src/peptide_search.rs
@@ -38,6 +38,7 @@ impl From<&Protein> for ProteinInfo {
/// * `equate_il` - Boolean indicating if we want to equate I and L during search
/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the
/// taxonomy
+/// * `tryptic` - Boolean indicating if we only want tryptic matches.
///
/// # Returns
///
@@ -50,7 +51,8 @@ pub fn search_proteins_for_peptide<'a>(
searcher: &'a Searcher,
peptide: &str,
cutoff: usize,
- equate_il: bool
+ equate_il: bool,
+ tryptic: bool
) -> Option<(bool, Vec<&'a Protein>)> {
let peptide = peptide.trim_end().to_uppercase();
@@ -59,7 +61,7 @@ pub fn search_proteins_for_peptide<'a>(
return None;
}
- let suffix_search = searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equate_il);
+ let suffix_search = searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equate_il, tryptic);
let (suffixes, cutoff_used) = match suffix_search {
SearchAllSuffixesResult::MaxMatches(matched_suffixes) => Some((matched_suffixes, true)),
SearchAllSuffixesResult::SearchResult(matched_suffixes) => Some((matched_suffixes, false)),
@@ -71,8 +73,14 @@ pub fn search_proteins_for_peptide<'a>(
Some((cutoff_used, proteins))
}
-pub fn search_peptide(searcher: &Searcher, peptide: &str, cutoff: usize, equate_il: bool) -> Option {
- let (cutoff_used, proteins) = search_proteins_for_peptide(searcher, peptide, cutoff, equate_il)?;
+pub fn search_peptide(
+ searcher: &Searcher,
+ peptide: &str,
+ cutoff: usize,
+ equate_il: bool,
+ tryptic: bool
+) -> Option {
+ let (cutoff_used, proteins) = search_proteins_for_peptide(searcher, peptide, cutoff, equate_il, tryptic)?;
Some(SearchResult {
sequence: peptide.to_string(),
@@ -91,6 +99,7 @@ pub fn search_peptide(searcher: &Searcher, peptide: &str, cutoff: usize, equate_
/// * `equate_il` - Boolean indicating if we want to equate I and L during search
/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the
/// taxonomy
+/// * `tryptic` - Boolean indicating if we only want tryptic matches.
///
/// # Returns
///
@@ -99,11 +108,12 @@ pub fn search_all_peptides(
searcher: &Searcher,
peptides: &Vec,
cutoff: usize,
- equate_il: bool
+ equate_il: bool,
+ tryptic: bool
) -> Vec {
peptides
.par_iter()
- .filter_map(|peptide| search_peptide(searcher, peptide, cutoff, equate_il))
+ .filter_map(|peptide| search_peptide(searcher, peptide, cutoff, equate_il, tryptic))
.collect()
}
diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs
index e9590c8..dab8577 100644
--- a/sa-index/src/sa_searcher.rs
+++ b/sa-index/src/sa_searcher.rs
@@ -1,6 +1,6 @@
use std::{cmp::min, ops::Deref};
-use sa_mappings::proteins::{Protein, Proteins};
+use sa_mappings::proteins::{Protein, Proteins, SEPARATION_CHARACTER, TERMINATION_CHARACTER};
use text_compression::ProteinTextSlice;
use crate::{
@@ -296,6 +296,7 @@ impl Searcher {
/// * `max_matches` - The maximum amount of matches processed, if more matches are found we
/// don't process them
/// * `equate_il` - True if we want to equate I and L during search, otherwise false
+ /// * `tryptic` - Boolean indicating if we only want tryptic matches.
///
/// # Returns
///
@@ -305,7 +306,8 @@ impl Searcher {
&self,
search_string: &[u8],
max_matches: usize,
- equate_il: bool
+ equate_il: bool,
+ tryptic: bool
) -> SearchAllSuffixesResult {
let mut matching_suffixes: Vec = vec![];
let mut il_locations = vec![];
@@ -333,30 +335,41 @@ impl Searcher {
let mut sa_index = min_bound;
while sa_index < max_bound {
let suffix = self.sa.get(sa_index) as usize;
- // filter away matches where I was wrongfully equalized to L, and check the
- // unmatched prefix when I and L equalized, we only need to
- // check the prefix, not the whole match, when the prefix is 0, we don't need to
- // check at all
- if suffix >= skip
- && ((skip == 0
- || ProteinTextSlice::new(&self.proteins.text, suffix - skip, suffix)
- .equals_slice(current_search_string_prefix, equate_il)) // Check the prefix
- &&
- Self::check_suffix(
+
+ if suffix >= skip {
+ let match_start = suffix - skip;
+ let match_end = suffix + search_string.len() - skip;
+
+ // Filter away matches where I was wrongfully equated to L, and check the
+ // unmatched prefix. When I and L are equated, we only need to
+ // check the prefix, not the whole match. When the prefix is empty (`skip == 0`),
+ // there is nothing to check at all.
+ if (skip == 0
+ || Self::check_prefix(
+ current_search_string_prefix,
+ ProteinTextSlice::new(&self.proteins.text, match_start, suffix),
+ equate_il
+ ))
+ && Self::check_suffix(
skip,
il_locations_current_suffix,
current_search_string_suffix,
- ProteinTextSlice::new(&self.proteins.text, suffix, suffix + search_string.len() - skip),
+ ProteinTextSlice::new(&self.proteins.text, suffix, match_end),
equate_il
- ))
- {
- matching_suffixes.push((suffix - skip) as i64);
-
- // return if max number of matches is reached
- if matching_suffixes.len() >= max_matches {
- return SearchAllSuffixesResult::MaxMatches(matching_suffixes);
+ )
+ && (!tryptic
+ || ((self.check_start_of_protein(match_start) || self.check_tryptic_cut(match_start))
+ && (self.check_end_of_protein(match_end) || self.check_tryptic_cut(match_end))))
+ {
+ matching_suffixes.push((suffix - skip) as i64);
+
+ // return if max number of matches is reached
+ if matching_suffixes.len() >= max_matches {
+ return SearchAllSuffixesResult::MaxMatches(matching_suffixes);
+ }
}
}
+
sa_index += 1;
}
}
@@ -370,6 +383,64 @@ impl Searcher {
}
}
+ /// Check if a cut is the start of a protein.
+ ///
+ /// # Arguments
+ /// * `cut_index` - The index of the cut in the text of proteins; the cut lies directly before this index.
+ ///
+ /// # Returns
+ ///
+ /// Returns true if `cut_index` is 0 or the character at `cut_index - 1` is `SEPARATION_CHARACTER`.
+ #[inline]
+ fn check_start_of_protein(&self, cut_index: usize) -> bool {
+ cut_index == 0 || self.proteins.text.get(cut_index - 1) == SEPARATION_CHARACTER
+ }
+
+ /// Check if a cut is the end of a protein.
+ ///
+ /// # Arguments
+ /// * `cut_index` - The index of the cut in the text of proteins; the character at this index is inspected.
+ ///
+ /// # Returns
+ ///
+ /// Returns true if the character at `cut_index` is `TERMINATION_CHARACTER` or `SEPARATION_CHARACTER`.
+ #[inline]
+ fn check_end_of_protein(&self, cut_index: usize) -> bool {
+ self.proteins.text.get(cut_index) == TERMINATION_CHARACTER
+ || self.proteins.text.get(cut_index) == SEPARATION_CHARACTER
+ }
+
+ /// Check if a cut is a tryptic cut: the amino acid preceding the cut is K or R and the amino acid at the cut is not P.
+ ///
+ /// # Arguments
+ /// * `cut_index` - The index of the cut in the text of proteins. Must be greater than 0, because the character at `cut_index - 1` is read.
+ ///
+ /// # Returns
+ ///
+ /// Returns true if the cut is a tryptic cut.
+ #[inline]
+ fn check_tryptic_cut(&self, cut_index: usize) -> bool {
+ (self.proteins.text.get(cut_index - 1) == b'K' || self.proteins.text.get(cut_index - 1) == b'R')
+ && self.proteins.text.get(cut_index) != b'P'
+ }
+
+ /// Returns true if the prefixes are the same.
+ /// If `equate_il` is set to true, L and I are considered the same.
+ ///
+ /// # Arguments
+ /// * `search_string_prefix` - The unchecked prefix of the string/peptide that is searched
+ /// * `index_prefix` - The unchecked prefix from the protein from the suffix array
+ /// * `equate_il` - True if we want to equate I and L during search, otherwise false
+ ///
+ /// # Returns
+ ///
+ /// Returns true if `search_string_prefix` and `index_prefix` are considered the same, otherwise
+ /// false
+ #[inline]
+ fn check_prefix(search_string_prefix: &[u8], index_prefix: ProteinTextSlice, equate_il: bool) -> bool {
+ index_prefix.equals_slice(search_string_prefix, equate_il)
+ }
+
/// Returns true of the search_string and index_string are equal
/// This is automatically true if `equate_il` is set to true, since there matched during
/// search where I = L If `equate_il` is set to false, we need to check if the I and
@@ -510,11 +581,11 @@ mod tests {
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
// search suffix 'VAA'
- let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false);
+ let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![7]));
// search suffix 'AC'
- let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false);
+ let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![5, 11]));
}
@@ -543,11 +614,11 @@ mod tests {
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
// search bounds 'RIZ' with equal I and L
- let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Y'], usize::MAX, true);
+ let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Y'], usize::MAX, true, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![16]));
// search bounds 'RIZ' without equal I and L
- let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Y'], usize::MAX, false);
+ let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Y'], usize::MAX, false, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::NoMatches);
}
@@ -571,7 +642,7 @@ mod tests {
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
// search bounds 'IM' with equal I and L
- let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true);
+ let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0]));
}
@@ -593,7 +664,7 @@ mod tests {
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
- let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true);
+ let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![2, 3, 4, 5]));
}
@@ -615,7 +686,7 @@ mod tests {
let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
- let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
+ let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4]));
}
@@ -639,7 +710,7 @@ mod tests {
// search all places where II is in the string IIIILL, but with a sparse SA
// this way we check if filtering the suffixes works as expected
- let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, false);
+ let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, false, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2]));
}
@@ -662,7 +733,32 @@ mod tests {
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
// search bounds 'IM' with equal I and L
- let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
+ let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4]));
}
+
+ #[test]
+ fn test_tryptic_search() {
+ let input_string = "PAA-AAKPKAPAA$";
+ let text = ProteinText::from_string(input_string);
+
+ let proteins = Proteins {
+ text,
+ proteins: vec![Protein {
+ uniprot_id: String::new(),
+ taxon_id: 0,
+ functional_annotations: vec![]
+ }]
+ };
+
+ let sparse_sa = SuffixArray::Original(vec![13, 3, 12, 11, 1, 4, 2, 5, 9, 8, 6, 10, 0, 7], 1);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
+ let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
+
+ let found_suffixes_1 = searcher.search_matching_suffixes(&[b'P', b'A', b'A'], usize::MAX, false, true);
+ assert_eq!(found_suffixes_1, SearchAllSuffixesResult::SearchResult(vec![0])); // the 'PAA' at index 10 is preceded by 'A', so it is not a tryptic match
+
+ let found_suffixes_2 = searcher.search_matching_suffixes(&[b'A', b'P', b'A', b'A'], usize::MAX, false, true);
+ assert_eq!(found_suffixes_2, SearchAllSuffixesResult::SearchResult(vec![9])); // 'APAA' at index 9 follows 'K' and runs up to the final '$'
+ }
}
diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs
index 5284546..1a1cedf 100644
--- a/sa-server/src/main.rs
+++ b/sa-server/src/main.rs
@@ -58,7 +58,9 @@ struct InputData {
cutoff: usize,
#[serde(default = "bool::default")]
// default value is false // TODO: maybe default should be true?
- equate_il: bool
+ equate_il: bool,
+ #[serde(default = "bool::default")] // default false
+ tryptic: bool
}
#[tokio::main]
@@ -83,7 +85,7 @@ async fn search(
State(searcher): State>,
data: Json
) -> Result>, StatusCode> {
- let search_result = search_all_peptides(&searcher, &data.peptides, data.cutoff, data.equate_il);
+ let search_result = search_all_peptides(&searcher, &data.peptides, data.cutoff, data.equate_il, data.tryptic);
Ok(Json(search_result))
}
diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs
index 4866a6c..338e234 100644
--- a/text-compression/src/lib.rs
+++ b/text-compression/src/lib.rs
@@ -24,7 +24,7 @@ impl ProteinText {
/// Returns the hashmap
fn create_char_to_5bit_hashmap() -> HashMap {
let mut hashmap = HashMap::::new();
- for (i, c) in "ACDEFGHIKLMNPQRSTVWY-$".chars().enumerate() {
+ for (i, c) in "ABCDEFGHIKLMNOPQRSTUVWXYZ-$".chars().enumerate() {
hashmap.insert(c as u8, i as u8);
}
@@ -38,7 +38,7 @@ impl ProteinText {
/// Returns the vector
fn create_bit5_to_char() -> Vec {
let mut vec = Vec::::new();
- for c in "ACDEFGHIKLMNPQRSTVWY-$".chars() {
+ for c in "ABCDEFGHIKLMNOPQRSTUVWXYZ-$".chars() {
vec.push(c as u8);
}
vec
@@ -58,7 +58,8 @@ impl ProteinText {
let mut bit_array = BitArray::with_capacity(input_string.len(), 5);
for (i, c) in input_string.chars().enumerate() {
- let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect("Input character not in alphabet");
+ let char_5bit: u8 =
+ *char_to_5bit.get(&(c as u8)).unwrap_or_else(|| panic!("Input character '{}' not in alphabet", c));
bit_array.set(i, char_5bit as u64);
}
@@ -79,7 +80,8 @@ impl ProteinText {
let mut bit_array = BitArray::with_capacity(input_vec.len(), 5);
for (i, e) in input_vec.iter().enumerate() {
- let char_5bit: u8 = *char_to_5bit.get(e).expect("Input character not in alphabet");
+ let char_5bit: u8 =
+ *char_to_5bit.get(e).unwrap_or_else(|| panic!("Input character '{}' not in alphabet", e));
bit_array.set(i, char_5bit as u64);
}
@@ -131,7 +133,10 @@ impl ProteinText {
/// * `index` - The index of the character to change.
/// * `value` - The character to fill in as `u8`.
pub fn set(&mut self, index: usize, value: u8) {
- let char_5bit: u8 = *self.char_to_5bit.get(&value).expect("Input character not in alphabet");
+ let char_5bit: u8 = *self
+ .char_to_5bit
+ .get(&value)
+ .unwrap_or_else(|| panic!("Input character '{}' not in alphabet", value));
self.bit_array.set(index, char_5bit as u64);
}
@@ -445,7 +450,7 @@ mod tests {
let char_to_5bit = ProteinText::create_char_to_5bit_hashmap();
let bit5_to_char = ProteinText::create_bit5_to_char();
- for c in "ACDEFGHIKLMNPQRSTVWY-$".chars() {
+ for c in "ABCDEFGHIKLMNOPQRSTUVWXYZ-$".chars() {
let char_5bit = char_to_5bit.get(&(c as u8)).unwrap();
assert_eq!(c as u8, bit5_to_char[*char_5bit as usize]);
}
@@ -477,7 +482,8 @@ mod tests {
let mut bit_array = BitArray::with_capacity(input_string.len(), 5);
for (i, c) in input_string.chars().enumerate() {
- let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect("Input character not in alphabet");
+ let char_5bit: u8 =
+ *char_to_5bit.get(&(c as u8)).unwrap_or_else(|| panic!("Input character '{}' not in alphabet", c));
bit_array.set(i, char_5bit as u64);
}
@@ -592,7 +598,7 @@ mod tests {
let mut reader = std::io::BufReader::new(&data[..]);
let compressed_text = load_compressed_text(&mut reader).unwrap();
- for (i, c) in "CDEFGHIKLM".chars().enumerate() {
+ for (i, c) in "BCDEFGHIKL".chars().enumerate() {
assert_eq!(compressed_text.get(i), c as u8);
}
}
diff --git a/unipept-index.iml b/unipept-index.iml
index 8021953..ce5666f 100644
--- a/unipept-index.iml
+++ b/unipept-index.iml
@@ -1,8 +1,23 @@
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+