From a375dda81cdb99a4ce1cd7c7807afa6e3b406337 Mon Sep 17 00:00:00 2001
From: Bram Devlaminck
Date: Wed, 22 May 2024 12:21:08 +0200
Subject: [PATCH] add back SA search tests

---
 Cargo.lock                  |   1 +
 sa-index/Cargo.toml         |   3 +
 sa-index/src/sa_searcher.rs | 362 ++++++++++++++++++++++++++++++++++++
 sa-mappings/src/proteins.rs |   2 +-
 4 files changed, 367 insertions(+), 1 deletion(-)

diff --git a/Cargo.lock b/Cargo.lock
index 27c3eed..d5ed545 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1421,6 +1421,7 @@ dependencies = [
  "sa-mappings",
  "serde",
  "serde_json",
+ "tempdir",
  "umgap",
 ]
 
diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml
index c355bef..70acb67 100644
--- a/sa-index/Cargo.toml
+++ b/sa-index/Cargo.toml
@@ -5,6 +5,9 @@ edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
+[dev-dependencies]
+tempdir = "0.3.7"
+
 [dependencies]
 clap = { version = "4.4.8", features = ["derive"] }
 umgap = "1.1.0"
diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs
index 351e845..78cc043 100644
--- a/sa-index/src/sa_searcher.rs
+++ b/sa-index/src/sa_searcher.rs
@@ -546,3 +546,365 @@ impl Searcher {
             .get_all_functional_annotations(proteins)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use sa_mappings::functionality::FunctionAggregator;
+    use sa_mappings::proteins::{Protein, Proteins};
+    use sa_mappings::taxonomy::{AggregationMethod, TaxonAggregator};
+    use crate::sa_searcher::{
+        BoundSearchResult, SearchAllSuffixesResult, Searcher,
+    };
+    use crate::suffix_to_protein_index::SparseSuffixToProtein;
+    use tempdir::TempDir;
+    use std::{
+        fs::File,
+        io::Write,
+        path::PathBuf
+    };
+
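+    /// Writes a small example taxonomy TSV file into the given temporary directory and returns its path.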
+    fn create_taxonomy_file(tmp_dir: &TempDir) -> PathBuf {
+        let taxonomy_file = tmp_dir.path().join("taxonomy.tsv");
+        let mut file = File::create(&taxonomy_file).unwrap();
+
+        writeln!(file, "1\troot\tno rank\t1\t\x01").unwrap();
+        writeln!(file, "2\tBacteria\tsuperkingdom\t1\t\x01").unwrap();
+        writeln!(file, "6\tAzorhizobium\tgenus\t1\t\x01").unwrap();
+        writeln!(file, "7\tAzorhizobium caulinodans\tspecies\t6\t\x01").unwrap();
+        writeln!(file, "9\tBuchnera aphidicola\tspecies\t6\t\x01").unwrap();
+        writeln!(file, "10\tCellvibrio\tgenus\t6\t\x01").unwrap();
+        writeln!(file, "11\tCellulomonas gilvus\tspecies\t10\t\x01").unwrap();
+        writeln!(file, "13\tDictyoglomus\tgenus\t11\t\x01").unwrap();
+        writeln!(file, "14\tDictyoglomus thermophilum\tspecies\t10\t\x01").unwrap();
+        writeln!(file, "16\tMethylophilus\tgenus\t14\t\x01").unwrap();
+        writeln!(file, "17\tMethylophilus methylotrophus\tspecies\t16\t\x01").unwrap();
+        writeln!(file, "18\tPelobacter\tgenus\t17\t\x01").unwrap();
+        writeln!(file, "19\tSyntrophotalea carbinolica\tspecies\t17\t\x01").unwrap();
+        writeln!(file, "20\tPhenylobacterium\tgenus\t19\t\x01").unwrap();
+
+        taxonomy_file
+    }
+
+    fn get_example_proteins() -> Proteins {
+        let text = "AI-BLACVAA-AC-KCRLZ$".to_string().into_bytes();
+        Proteins {
+            input_string: text,
+            proteins: vec![
+                Protein {
+                    uniprot_id: String::new(),
+                    taxon_id: 0,
+                    functional_annotations: vec![],
+                },
+                Protein {
+                    uniprot_id: String::new(),
+                    taxon_id: 0,
+                    functional_annotations: vec![],
+                },
+                Protein {
+                    uniprot_id: String::new(),
+                    taxon_id: 0,
+                    functional_annotations: vec![],
+                },
+                Protein {
+                    uniprot_id: String::new(),
+                    taxon_id: 0,
+                    functional_annotations: vec![],
+                },
+            ],
+        }
+    }
+
+    #[test]
+    fn test_search_simple() {
+        let proteins = get_example_proteins();
+        let sa = vec![
+            19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18,
+        ];
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let searcher = Searcher::new(
+            sa,
+            1,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        // search bounds 'A'
+        let bounds_res = searcher.search_bounds(&[b'A']);
+        assert_eq!(bounds_res, BoundSearchResult::SearchResult((4, 9)));
+
+        // search bounds '$'
+        let bounds_res = searcher.search_bounds(&[b'$']);
+        assert_eq!(bounds_res, BoundSearchResult::SearchResult((0, 1)));
+
+        // search bounds 'AC'
+        let bounds_res = searcher.search_bounds(&[b'A', b'C']);
+        assert_eq!(bounds_res, BoundSearchResult::SearchResult((6, 8)));
+    }
+
+    #[test]
+    fn test_search_sparse() {
+        let proteins = get_example_proteins();
+        let sa = vec![9, 0, 3, 12, 15, 6, 18];
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let searcher = Searcher::new(
+            sa,
+            3,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        // search suffix 'VAA'
+        let found_suffixes =
+            searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![7])
+        );
+
+        // search suffix 'AC'
+        let found_suffixes = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![5, 11])
+        );
+    }
+
+    #[test]
+    fn test_il_equality() {
+        let proteins = get_example_proteins();
+        let sa = vec![
+            19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18,
+        ];
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let searcher = Searcher::new(
+            sa,
+            1,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        let bounds_res = searcher.search_bounds(&[b'I']);
+        assert_eq!(bounds_res, BoundSearchResult::SearchResult((13, 16)));
+
+        // search bounds 'RIZ' with equal I and L
+        let bounds_res = searcher.search_bounds(&[b'R', b'I', b'Z']);
+        assert_eq!(bounds_res, BoundSearchResult::SearchResult((17, 18)));
+    }
+
+    #[test]
+    fn test_il_equality_sparse() {
+        let proteins = get_example_proteins();
+        let sa = vec![9, 0, 3, 12, 15, 6, 18];
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let searcher = Searcher::new(
+            sa,
+            3,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        // search suffix 'RIZ' with equal I and L
+        let found_suffixes =
+            searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![16])
+        );
+
+        // search suffix 'RIZ' without equal I and L
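+        // no match is expected here: the example text contains 'RLZ' but never a literal 'RIZ'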
+        let found_suffixes =
+            searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false);
+        assert_eq!(found_suffixes, SearchAllSuffixesResult::NoMatches);
+    }
+
+    // test the edge case where an I or L is at the first index in the sparse SA
+    #[test]
+    fn test_l_first_index_in_sa() {
+        let text = "LMOXZ$".to_string().into_bytes();
+
+        let proteins = Proteins {
+            input_string: text,
+            proteins: vec![Protein {
+                uniprot_id: String::new(),
+                taxon_id: 0,
+                functional_annotations: vec![],
+            }],
+        };
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let sparse_sa = vec![0, 2, 4];
+        let searcher = Searcher::new(
+            sparse_sa,
+            2,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        // search suffix 'IM' with equal I and L
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![0])
+        );
+    }
+
+    #[test]
+    fn test_il_missing_matches() {
+        let text = "AAILLL$".to_string().into_bytes();
+
+        let proteins = Proteins {
+            input_string: text,
+            proteins: vec![Protein {
+                uniprot_id: String::new(),
+                taxon_id: 0,
+                functional_annotations: vec![],
+            }],
+        };
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let sparse_sa = vec![6, 0, 1, 5, 4, 3, 2];
+        let searcher = Searcher::new(
+            sparse_sa,
+            1,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![2, 3, 4, 5])
+        );
+    }
+
+    #[test]
+    fn test_il_duplication() {
+        let text = "IIIILL$".to_string().into_bytes();
+
+        let proteins = Proteins {
+            input_string: text,
+            proteins: vec![Protein {
+                uniprot_id: String::new(),
+                taxon_id: 0,
+                functional_annotations: vec![],
+            }],
+        };
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let sparse_sa = vec![6, 5, 4, 3, 2, 1, 0];
+        let searcher = Searcher::new(
+            sparse_sa,
+            1,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4])
+        );
+    }
+
+    #[test]
+    fn test_il_suffix_check() {
+        let text = "IIIILL$".to_string().into_bytes();
+
+        let proteins = Proteins {
+            input_string: text,
+            proteins: vec![Protein {
+                uniprot_id: String::new(),
+                taxon_id: 0,
+                functional_annotations: vec![],
+            }],
+        };
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
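+        // sparse suffix array with sparseness factor 2: only suffixes starting at even positions are stored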
+        let sparse_sa = vec![6, 4, 2, 0];
+        let searcher = Searcher::new(
+            sparse_sa,
+            2,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        // search all places where 'II' occurs in the string IIIILL, but with a sparse SA
+        // this way we check that filtering the suffixes works as expected
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, false);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![0, 1, 2])
+        );
+    }
+
+    #[test]
+    fn test_il_duplication2() {
+        let text = "IILLLL$".to_string().into_bytes();
+
+        let proteins = Proteins {
+            input_string: text,
+            proteins: vec![Protein {
+                uniprot_id: String::new(),
+                taxon_id: 0,
+                functional_annotations: vec![],
+            }],
+        };
+
+        let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap();
+        let taxonomy_file = create_taxonomy_file(&tmp_dir);
+
+        let sparse_sa = vec![6, 5, 4, 3, 2, 1, 0];
+        let searcher = Searcher::new(
+            sparse_sa,
+            1,
+            Box::new(SparseSuffixToProtein::new(&proteins.input_string)),
+            proteins,
+            TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(),
+            FunctionAggregator {}
+        );
+
+        // search suffix 'II' with equal I and L
+        let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
+        assert_eq!(
+            found_suffixes,
+            SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4])
+        );
+    }
+}
\ No newline at end of file
diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs
index 92fd523..900c531 100644
--- a/sa-mappings/src/proteins.rs
+++ b/sa-mappings/src/proteins.rs
@@ -40,7 +40,7 @@ pub struct Proteins {
     pub input_string: Vec<u8>,
 
     /// The proteins in the input string
-    proteins: Vec<Protein>
+    pub proteins: Vec<Protein>
 }
 
 impl Protein {