From a375dda81cdb99a4ce1cd7c7807afa6e3b406337 Mon Sep 17 00:00:00 2001
From: Bram Devlaminck
Date: Wed, 22 May 2024 12:21:08 +0200
Subject: [PATCH] add back SA search tests

---
 Cargo.lock                  |   1 +
 sa-index/Cargo.toml         |   3 +
 sa-index/src/sa_searcher.rs | 362 ++++++++++++++++++++++++++++++++++++
 sa-mappings/src/proteins.rs |   2 +-
 4 files changed, 367 insertions(+), 1 deletion(-)

diff --git a/Cargo.lock b/Cargo.lock
index 27c3eed..d5ed545 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1421,6 +1421,7 @@ dependencies = [
  "sa-mappings",
  "serde",
  "serde_json",
+ "tempdir",
  "umgap",
 ]
 
diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml
index c355bef..70acb67 100644
--- a/sa-index/Cargo.toml
+++ b/sa-index/Cargo.toml
@@ -5,6 +5,9 @@ edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
+[dev-dependencies]
+tempdir = "0.3.7"
+
 [dependencies]
 clap = { version = "4.4.8", features = ["derive"] }
 umgap = "1.1.0"
diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs
index 351e845..78cc043 100644
--- a/sa-index/src/sa_searcher.rs
+++ b/sa-index/src/sa_searcher.rs
@@ -546,3 +546,365 @@ impl Searcher {
             .get_all_functional_annotations(proteins)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use sa_mappings::functionality::FunctionAggregator;
+    use sa_mappings::proteins::{Protein, Proteins};
+    use sa_mappings::taxonomy::{AggregationMethod, TaxonAggregator};
+    use crate::sa_searcher::{
+        BoundSearchResult, SearchAllSuffixesResult, Searcher,
+    };
+    use crate::suffix_to_protein_index::SparseSuffixToProtein;
+    use tempdir::TempDir;
+    use std::{
+        fs::File,
+        io::Write,
+        path::PathBuf
+    };
+
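+    /// Writes a small example taxonomy TSV file into the given temporary directory and returns its path.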
+    fn create_taxonomy_file(tmp_dir: &TempDir) -> PathBuf {
+        let taxonomy_file = tmp_dir.path().join("taxonomy.tsv");
+        let mut file = File::create(&taxonomy_file).unwrap();
+
+        writeln!(file, "1\troot\tno rank\t1\t\x01").unwrap();
+        writeln!(file, "2\tBacteria\tsuperkingdom\t1\t\x01").unwrap();
+        writeln!(file, "6\tAzorhizobium\tgenus\t1\t\x01").unwrap();
+        writeln!(file, "7\tAzorhizobium caulinodans\tspecies\t6\t\x01").unwrap();
+        writeln!(file, "9\tBuchnera aphidicola\tspecies\t6\t\x01").unwrap();
+        writeln!(file, "10\tCellvibrio\tgenus\t6\t\x01").unwrap();
+        writeln!(file, "11\tCellulomonas gilvus\tspecies\t10\t\x01").unwrap();
+        writeln!(file, "13\tDictyoglomus\tgenus\t11\t\x01").unwrap();
+        writeln!(file, "14\tDictyoglomus thermophilum\tspecies\t10\t\x01").unwrap();
+        writeln!(file, "16\tMethylophilus\tgenus\t14\t\x01").unwrap();
+        writeln!(file, "17\tMethylophilus methylotrophus\tspecies\t16\t\x01").unwrap();
+        writeln!(file, "18\tPelobacter\tgenus\t17\t\x01").unwrap();
+        writeln!(file, "19\tSyntrophotalea carbinolica\tspecies\t17\t\x01").unwrap();
+        writeln!(file, "20\tPhenylobacterium\tgenus\t19\t\x01").unwrap();
+
+        taxonomy_file
+    }
+
+    fn get_example_proteins() -> Proteins {
+        let text = "AI-BLACVAA-AC-KCRLZ$".to_string().into_bytes();
+        Proteins {
+            input_string: text,
+            proteins: vec![
+                Protein {
+                    uniprot_id: String::new(),
+                    taxon_id: 0,
+                    functional_annotations: vec![],
+                },
+                Protein {
+                    uniprot_id: String::new(),
+                    taxon_id: 0,
+                    functional_annotations: vec![],
+                },
+                Protein {
+                    uniprot_id: String::new(),
+                    taxon_id: 0,
+                    functional_annotations: vec![],
+                },
+                Protein {
+                    uniprot_id: String::new(),
+                    taxon_id: 0,
+                    functional_annotations: vec![],
+                },
+            ],
+        }
+    }
+
+    #[test]
+    fn test_search_simple() {
+        let proteins = get_example_proteins();
+        let sa = vec![
+            19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18,
+        ];
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let searcher = Searcher::new(
+            sa,
+            1,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        // search bounds 'A'
+        let bounds_res = searcher.search_bounds(&[b'A']);
+        assert_eq!(bounds_res, BoundSearchResult::SearchResult((4, 9)));
+
+        // search bounds '$'
+        let bounds_res = searcher.search_bounds(&[b'$']);
+        assert_eq!(bounds_res, BoundSearchResult::SearchResult((0, 1)));
+
+        // search bounds 'AC'
+        let bounds_res = searcher.search_bounds(&[b'A', b'C']);
+        assert_eq!(bounds_res, BoundSearchResult::SearchResult((6, 8)));
+    }
+
+    #[test]
+    fn test_search_sparse() {
+        let proteins = get_example_proteins();
+        let sa = vec![9, 0, 3, 12, 15, 6, 18];
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let searcher = Searcher::new(
+            sa,
+            3,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        // search suffix 'VAA'
+        let found_suffixes =
+            searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![7])
+        );
+
+        // search suffix 'AC'
+        let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![5, 11])
+        );
+    }
+
+    #[test]
+    fn test_il_equality() {
+        let proteins = get_example_proteins();
+        let sa = vec![
+            19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18,
+        ];
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let searcher = Searcher::new(
+            sa,
+            1,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        let bounds_res = searcher.search_bounds(&[b'I']);
+        assert_eq!(bounds_res, BoundSearchResult::SearchResult((13, 16)));
+
+        // search bounds 'RIZ' with equal I and L
+        let bounds_res = searcher.search_bounds(&[b'R', b'I', b'Z']);
+        assert_eq!(bounds_res, BoundSearchResult::SearchResult((17, 18)));
+    }
+
+    #[test]
+    fn test_il_equality_sparse() {
+        let proteins = get_example_proteins();
+        let sa = vec![9, 0, 3, 12, 15, 6, 18];
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let searcher = Searcher::new(
+            sa,
+            3,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        // search suffix 'RIZ' with equal I and L
+        let found_suffixes =
+            searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![16])
+        );
+
+        // search suffix 'RIZ' without equal I and L
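+        // no match is expected here: the example text contains 'RLZ' but never a literal 'RIZ'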
+        let found_suffixes =
+            searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false);
+        assert_eq!(found_suffixes, SearchAllSuffixesResult::NoMatches);
+    }
+
+    // test the edge case where an I or L is at the first index in the sparse SA
+    #[test]
+    fn test_l_first_index_in_sa() {
+        let text = "LMOXZ$".to_string().into_bytes();
+
+        let proteins = Proteins {
+            input_string: text,
+            proteins: vec![Protein {
+                uniprot_id: String::new(),
+                taxon_id: 0,
+                functional_annotations: vec![],
+            }],
+        };
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let sparse_sa = vec![0, 2, 4];
+        let searcher = Searcher::new(
+            sparse_sa,
+            2,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        // search suffix 'IM' with equal I and L
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![0])
+        );
+    }
+
+    #[test]
+    fn test_il_missing_matches() {
+        let text = "AAILLL$".to_string().into_bytes();
+
+        let proteins = Proteins {
+            input_string: text,
+            proteins: vec![Protein {
+                uniprot_id: String::new(),
+                taxon_id: 0,
+                functional_annotations: vec![],
+            }],
+        };
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let sparse_sa = vec![6, 0, 1, 5, 4, 3, 2];
+        let searcher = Searcher::new(
+            sparse_sa,
+            1,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![2, 3, 4, 5])
+        );
+    }
+
+    #[test]
+    fn test_il_duplication() {
+        let text = "IIIILL$".to_string().into_bytes();
+
+        let proteins = Proteins {
+            input_string: text,
+            proteins: vec![Protein {
+                uniprot_id: String::new(),
+                taxon_id: 0,
+                functional_annotations: vec![],
+            }],
+        };
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let sparse_sa = vec![6, 5, 4, 3, 2, 1, 0];
+        let searcher = Searcher::new(
+            sparse_sa,
+            1,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4])
+        );
+    }
+
+    #[test]
+    fn test_il_suffix_check() {
+        let text = "IIIILL$".to_string().into_bytes();
+
+        let proteins = Proteins {
+            input_string: text,
+            proteins: vec![Protein {
+                uniprot_id: String::new(),
+                taxon_id: 0,
+                functional_annotations: vec![],
+            }],
+        };
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
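+        // sparse suffix array with sparseness factor 2: only suffixes starting at even positions are stored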
+        let sparse_sa = vec![6, 4, 2, 0];
+        let searcher = Searcher::new(
+            sparse_sa,
+            2,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        // search all places where 'II' occurs in the string IIIILL, but with a sparse SA
+        // this way we check that filtering the suffixes works as expected
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, false);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![0, 1, 2])
+        );
+    }
+
+    #[test]
+    fn test_il_duplication2() {
+        let text = "IILLLL$".to_string().into_bytes();
+
+        let proteins = Proteins {
+            input_string: text,
+            proteins: vec![Protein {
+                uniprot_id: String::new(),
+                taxon_id: 0,
+                functional_annotations: vec![],
+            }],
+        };
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let sparse_sa = vec![6, 5, 4, 3, 2, 1, 0];
+        let searcher = Searcher::new(
+            sparse_sa,
+            1,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        // search suffix 'II' with equal I and L
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4])
+        );
+    }
+}
\ No newline at end of file
diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs
index 92fd523..900c531 100644
--- a/sa-mappings/src/proteins.rs
+++ b/sa-mappings/src/proteins.rs
@@ -40,7 +40,7 @@ pub struct Proteins {
     pub input_string: Vec<u8>,
 
     /// The proteins in the input string
-    proteins: Vec<Protein>
+    pub proteins: Vec<Protein>
 }
 
 impl Protein {