From 9bd80fe1ec49ad78c9f530fd104d2f34517774a5 Mon Sep 17 00:00:00 2001 From: Bram Devlaminck Date: Tue, 14 May 2024 10:57:26 +0200 Subject: [PATCH] move all suffix array code from personal repo to unipept repo --- .gitmodules | 3 + Cargo.lock | 720 +++++++++++++++++++++++- Cargo.toml | 8 +- libsais64-rs/Cargo.toml | 10 + libsais64-rs/builder.rs | 89 +++ libsais64-rs/libsais | 1 + libsais64-rs/libsais-wrapper.h | 4 + libsais64-rs/src/lib.rs | 37 ++ sa-builder/Cargo.toml | 4 + sa-builder/build_suffix_array.pbs | 41 ++ sa-builder/src/binary.rs | 134 +++++ sa-builder/src/lib.rs | 77 +++ sa-builder/src/main.rs | 39 +- sa-index/Cargo.toml | 7 + sa-index/src/lib.rs | 226 ++++++++ sa-index/src/main.rs | 9 +- sa-index/src/peptide_search.rs | 252 +++++++++ sa-index/src/sa_searcher.rs | 506 +++++++++++++++++ sa-index/src/suffix_to_protein_index.rs | 159 ++++++ sa-index/src/util.rs | 55 ++ sa-mappings/Cargo.toml | 3 + sa-mappings/src/functionality.rs | 104 ++-- sa-mappings/src/proteins.rs | 113 ++-- sa-mappings/src/taxonomy.rs | 59 +- sa-server/Cargo.toml | 7 + sa-server/src/main.rs | 185 +++++- 26 files changed, 2720 insertions(+), 132 deletions(-) create mode 100644 .gitmodules create mode 100644 libsais64-rs/Cargo.toml create mode 100644 libsais64-rs/builder.rs create mode 160000 libsais64-rs/libsais create mode 100644 libsais64-rs/libsais-wrapper.h create mode 100644 libsais64-rs/src/lib.rs create mode 100644 sa-builder/build_suffix_array.pbs create mode 100644 sa-builder/src/binary.rs create mode 100644 sa-builder/src/lib.rs create mode 100644 sa-index/src/lib.rs create mode 100644 sa-index/src/peptide_search.rs create mode 100644 sa-index/src/sa_searcher.rs create mode 100644 sa-index/src/suffix_to_protein_index.rs create mode 100644 sa-index/src/util.rs diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..2bd9ebc --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "libsais64-rs/libsais"] + path = libsais64-rs/libsais + url = https://github.com/IlyaGrebnov/libsais.git diff --git a/Cargo.lock b/Cargo.lock index 1c10d09..27c3eed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -41,12 +41,66 @@ dependencies = [ "winapi", ] +[[package]] +name = "anstream" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + [[package]] name = "anstyle" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +[[package]] +name = "anstyle-parse" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a64c907d4e79225ac72e2a354c9ce84d50ebb4586dee56c82b3ee73004f537f5" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + +[[package]] +name = "async-trait" +version = "0.1.80" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.58", +] + [[package]] name = "attohttpc" version = "0.15.0" @@ -54,7 +108,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe174d1b67f7b2bafed829c09db039301eb5841f66e43be2cf60b326e7f8e2cc" dependencies = [ "flate2", - "http", + "http 0.2.12", "log", "native-tls", "openssl", @@ -80,6 +134,74 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" +[[package]] +name = "axum" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf" +dependencies = [ + "async-trait", + "axum-core", + "axum-macros", + "bytes", + "futures-util", + "http 1.1.0", + "http-body", + "http-body-util", + "hyper", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper 1.0.1", + "tokio", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-core" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a15c63fd72d41492dc4f497196f5da1fb04fb7529e631d73630d1b491e47a2e3" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http 1.1.0", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper 0.1.2", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-macros" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00c055ee2d014ae5981ce1016374e8213682aa14d9bf40e48ab48b5f3ef20eaa" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn 2.0.58", +] + [[package]] name = "backtrace" version = "0.3.71" @@ -95,6 +217,29 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "bindgen" +version = "0.69.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" +dependencies = [ + "bitflags 2.5.0", + "cexpr", + "clang-sys", + "itertools", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.58", + "which", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -147,6 +292,15 @@ version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -180,6 +334,17 @@ dependencies = [ "half", ] +[[package]] +name = "clang-sys" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67523a3b4be3ce1989d607a828d036249522dd9c1c8de7f4dd2dae43a37369d1" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "2.34.0" @@ -189,7 +354,7 @@ dependencies = [ "ansi_term", "atty", "bitflags 1.3.2", - "strsim", + "strsim 0.8.0", "textwrap", 
"unicode-width", "vec_map", @@ -202,6 +367,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" dependencies = [ "clap_builder", + "clap_derive", ] [[package]] @@ -210,8 +376,22 @@ version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" dependencies = [ + "anstream", "anstyle", "clap_lex", + "strsim 0.11.1", +] + +[[package]] +name = "clap_derive" +version = "4.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.58", ] [[package]] @@ -220,6 +400,21 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] + +[[package]] +name = "colorchoice" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" + [[package]] name = "core-foundation" version = "0.9.4" @@ -346,7 +541,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -429,6 +624,15 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" +[[package]] +name = "futures-channel" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", +] + [[package]] name = "futures-core" version = "0.3.30" @@ -470,6 +674,12 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "half" version = "2.4.0" @@ -489,6 +699,18 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -504,6 +726,15 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name 
= "http" version = "0.2.12" @@ -515,6 +746,87 @@ dependencies = [ "itoa", ] +[[package]] +name = "http" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0475f8b2ac86659c21b64320d5d653f9efe42acd2a4e560073ec61a155a34f1d" +dependencies = [ + "bytes", + "futures-core", + "http 1.1.0", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe575dd17d0862a9a33781c8c4696a55c320909004a67a00fb286ba8b1bc496d" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http 1.1.0", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", +] + +[[package]] +name = "hyper-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body", + "hyper", + "pin-project-lite", + "socket2", + "tokio", +] + [[package]] name = "idna" version = "0.5.0" @@ -533,9 +845,15 @@ checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" dependencies = [ "hermit-abi 0.3.9", "libc", - "windows-sys", + "windows-sys 0.52.0", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" + [[package]] name = "itertools" version = "0.10.5" @@ -566,12 +884,44 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "libc" version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +[[package]] +name = "libdivsufsort-rs" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ae6d5de0472a582c6be43c392ba23e8ca59add069c8bd661f6b2a90690be3fb" +dependencies = [ + "cmake", +] + +[[package]] +name = "libloading" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" +dependencies = [ + "cfg-if", + "windows-targets 0.52.4", +] + 
+[[package]] +name = "libsais64-rs" +version = "0.1.0" +dependencies = [ + "bindgen", +] + [[package]] name = "linux-raw-sys" version = "0.4.13" @@ -584,6 +934,12 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + [[package]] name = "memchr" version = "2.7.2" @@ -600,6 +956,18 @@ dependencies = [ "winapi", ] +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.7.2" @@ -609,6 +977,17 @@ dependencies = [ "adler", ] +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.48.0", +] + [[package]] name = "native-tls" version = "0.2.11" @@ -627,6 +1006,16 @@ dependencies = [ "tempfile", ] +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "num-traits" version = "0.2.18" @@ -636,6 +1025,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi 0.3.9", + "libc", +] + [[package]] name = "object" version = "0.32.2" @@ -716,6 +1115,26 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "pin-project" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.58", +] + [[package]] name = "pin-project-lite" version = "0.2.14" @@ -768,6 +1187,16 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "prettyplease" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" +dependencies = [ + "proc-macro2", + "syn 2.0.58", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -941,6 +1370,12 @@ version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustix" version = "0.38.32" @@ -951,9 +1386,15 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.52.0", ] +[[package]] +name = "rustversion" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "092474d1a01ea8278f69e6a358998405fae5b8b963ddaeb2b0b04a128bf1dfb0" + [[package]] name = "ryu" version = "1.0.17" @@ -963,10 +1404,25 @@ checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" [[package]] name = "sa-builder" version = "0.1.0" +dependencies = [ + "clap 4.5.4", + "libdivsufsort-rs", + "libsais64-rs", + "sa-mappings", +] [[package]] name = "sa-index" version = "0.1.0" +dependencies = [ + "clap 4.5.4", + "rayon", + "sa-builder", + "sa-mappings", + "serde", + "serde_json", + "umgap", +] [[package]] name = "sa-mappings" @@ -974,10 +1430,25 @@ version = "0.1.0" dependencies = [ "bytelines", "fa-compression", + "serde", + "serde_json", "tempdir", "umgap", ] +[[package]] +name = "sa-server" +version = "0.1.0" +dependencies = [ + "axum", + "clap 4.5.4", + "sa-builder", + "sa-index", + "sa-mappings", + "serde", + "tokio", +] + [[package]] name = "same-file" version = "1.0.6" @@ -993,7 +1464,7 @@ version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -1041,21 +1512,71 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.115" +version = "1.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" +checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3" dependencies = [ "itoa", "ryu", "serde", ] +[[package]] +name = "serde_path_to_error" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af99884400da37c88f5e9146b7f1fd0fbcae8f6eec4e9da38b67d05486f814a6" +dependencies = [ + "itoa", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "socket2" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "strsim" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" +[[package]] +name = "strsim" +version = "0.11.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "structopt" version = "0.3.26" @@ -1073,7 +1594,7 @@ version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" dependencies = [ - "heck", + "heck 0.3.3", "proc-macro-error", "proc-macro2", "quote", @@ -1092,7 +1613,7 @@ version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e6e163a520367c465f59e0a61a23cfae3b10b6546d78b6f672a382be79f7110" dependencies = [ - "heck", + "heck 0.3.3", "proc-macro2", "quote", "syn 1.0.109", @@ -1120,6 +1641,18 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + +[[package]] +name = "sync_wrapper" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" + [[package]] name = "tempdir" version = "0.3.7" @@ -1139,7 +1672,7 @@ dependencies = [ "cfg-if", "fastrand", "rustix", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -1184,7 +1717,72 @@ checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" dependencies = [ "backtrace", "bytes", + "libc", + "mio", + "num_cpus", "pin-project-lite", + "socket2", + "tokio-macros", + "windows-sys 0.48.0", +] + +[[package]] +name = "tokio-macros" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.58", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "pin-project", + "pin-project-lite", + "tokio", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" + +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "log", + "pin-project-lite", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", ] [[package]] @@ -1252,6 +1850,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + [[package]] name = "vcpkg" version = "0.2.15" @@ -1350,6 +1954,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "which" +version 
= "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + [[package]] name = "winapi" version = "0.3.9" @@ -1381,13 +1997,37 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.4", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] @@ -1396,51 +2036,93 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_aarch64_msvc" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_gnu" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_i686_msvc" version = "0.52.4" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnu" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "windows_x86_64_msvc" version = "0.52.4" diff --git a/Cargo.toml b/Cargo.toml index 461b751..728b06e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,9 +1,11 @@ [workspace] resolver = "2" -members = [ +members = [ "fa-compression", + "libsais64-rs", "sa-builder", "sa-index", - "sa-mappings" -, "sa-server"] + "sa-mappings", + "sa-server" +] diff --git a/libsais64-rs/Cargo.toml b/libsais64-rs/Cargo.toml new file mode 100644 index 0000000..36799c3 --- /dev/null +++ b/libsais64-rs/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "libsais64-rs" +version = "0.1.0" +edition = "2021" +build = "builder.rs" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[build-dependencies] +bindgen = "0.69.4" diff --git a/libsais64-rs/builder.rs b/libsais64-rs/builder.rs new file mode 100644 index 0000000..d4e26b2 --- /dev/null +++ b/libsais64-rs/builder.rs @@ -0,0 +1,89 @@ +use std::env; +use std::error::Error; +use std::fmt::{Display, Formatter}; +use std::path::PathBuf; +use std::process::{Command, ExitStatus}; + +/// Custom error for compilation of the C library +#[derive(Debug)] +struct CompileError<'a> { + command: &'a str, + exit_code: Option, +} + +impl<'a> Display for CompileError<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let end_text = if let Some(code) = self.exit_code { + format!("with exit code {}", code) + } else { + "without exit code".to_string() + }; + let text = format!("Command with name `{}` failed {}", self.command, end_text); + write!(f, "{}", text) + } +} + +impl<'a> Error for CompileError<'a> {} + +/// Handles the exit statuses of the executed bash commands +/// +/// # Arguments +/// * `name` - Name of the executed bash command +/// * `exit_states` - The exit status of the executed bash command +/// +/// # Returns +/// +/// Returns () if the exit status was success +/// +/// # Errors +/// +/// Returns a CompilationError if the command failed +fn exit_status_to_result(name: &str, exit_status: ExitStatus) -> Result<(), CompileError> { + match exit_status.success() { + true => Ok(()), + false => Err(CompileError { + command: name, + exit_code: exit_status.code(), + }), + } +} + +fn main() -> Result<(), Box> { + // compile the c library + Command::new("rm") + .args(["libsais/CMakeCache.txt"]) + .status().unwrap_or_default(); // if removing 
fails, it is because the CMake cache did not exist, so we can safely ignore it + exit_status_to_result( + "cmake", + Command::new("cmake") + .args(["-DCMAKE_BUILD_TYPE=\"Release\"", "libsais", "-Blibsais"]) + .status()?, + )?; + exit_status_to_result( + "make", + Command::new("make").args(["-C", "libsais"]).status()?, + )?; + + // link the C libsais library to Rust + println!("cargo:rustc-link-search=native=libsais64-rs/libsais"); + println!("cargo:rustc-link-lib=static=libsais"); + + // The bindgen::Builder is the main entry point + // to bindgen, and lets you build up options for + // the resulting bindings. + let bindings = bindgen::Builder::default() + // The input header we would like to generate + // bindings for. + .header("libsais-wrapper.h") + // Tell cargo to invalidate the built crate whenever any of the + // included header files changed. + .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) + // Finish the builder and generate the bindings. + .generate()?; + + // Write the bindings to the $OUT_DIR/bindings.rs file. + let out_path = PathBuf::from(env::var("OUT_DIR")?); + bindings.write_to_file(out_path.join("bindings.rs"))?; + + Ok(()) +} diff --git a/libsais64-rs/libsais b/libsais64-rs/libsais new file mode 160000 index 0000000..2e5f9c2 --- /dev/null +++ b/libsais64-rs/libsais @@ -0,0 +1 @@ +Subproject commit 2e5f9c22109847f42f8aeb7d4865e7ffa5dd0e11 diff --git a/libsais64-rs/libsais-wrapper.h b/libsais64-rs/libsais-wrapper.h new file mode 100644 index 0000000..fbfe0b9 --- /dev/null +++ b/libsais64-rs/libsais-wrapper.h @@ -0,0 +1,4 @@ +#include "libsais/include/libsais64.h" + + +int64_t libsais64(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); \ No newline at end of file diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs new file mode 100644 index 0000000..59b1959 --- /dev/null +++ b/libsais64-rs/src/lib.rs @@ -0,0 +1,37 @@ +// allow the non-Rust naming style used by the C code and import the generated C bindings +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + + +/// Builds the suffix array over the `text` using the libsais64 algorithm +/// +/// # Arguments +/// * `text` - The text used for suffix array construction +/// +/// # Returns +/// +/// Returns Some containing the suffix array built over the text if construction succeeds +/// Returns None if construction of the suffix array failed +pub fn sais64(text: &[u8]) -> Option<Vec<i64>> { + let mut sa = vec![0; text.len()]; + let exit_code = unsafe { libsais64(text.as_ptr(), sa.as_mut_ptr(), text.len() as i64, 0, std::ptr::null_mut()) }; + if exit_code == 0 { + Some(sa) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use crate::{sais64}; + + #[test] + fn check_build_sa_with_libsais64() { + let text = "banana$"; + let sa = sais64(text.as_bytes()); + assert_eq!(sa, Some(vec![6, 5, 3, 1, 0, 4, 2])); + } +} diff --git a/sa-builder/Cargo.toml b/sa-builder/Cargo.toml index 31d44b6..9937384 100644 --- a/sa-builder/Cargo.toml +++ b/sa-builder/Cargo.toml @@ -6,3 +6,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +clap = { version = "4.4.8", features = ["derive"] } +libsais64-rs = { path = "../libsais64-rs" } +libdivsufsort-rs = "0.1.0" +sa-mappings = { path = "../sa-mappings" } diff --git a/sa-builder/build_suffix_array.pbs b/sa-builder/build_suffix_array.pbs new file mode 100644 index 0000000..fac37e6 --- /dev/null +++ 
b/sa-builder/build_suffix_array.pbs @@ -0,0 +1,41 @@ +#!/bin/bash + +######################################################################################################### +### This script is designed to run on the Ghent University HPC ### +### ### +### how to use: ### +### 1) Swap to the high-memory gallade cluster by executing `module swap cluster/gallade` ### +### 2) Navigate to the root of the project ### +### 3) Submit the job to the queue with `qsub sa-builder/build_suffix_array.pbs` ### +######################################################################################################### + +# go to cluster with high memory +module swap cluster/gallade + +# define requested memory, cpu resources and email notifications +#PBS -m abe +#PBS -l walltime=10:00:00 +#PBS -l mem=750gb +# ask for 1 node, 1 cpu (not more needed since we don't use parallelism) +#PBS -l nodes=1:ppn=1 +#PBS -N suffix_array_construction_uniprot + +# define output and error files +#PBS -o stdout.$PBS_JOBID +#PBS -e stderr.$PBS_JOBID + +prefix="$VSC_DATA_VO/bram/" + +# load Rust +module load Rust/1.75.0-GCCcore-12.3.0 +module load Clang/16.0.6-GCCcore-12.3.0 # needed to build the bindings from Rust to C +module load CMake/3.26.3-GCCcore-12.3.0 + +# go to current working dir and execute +cd $PBS_O_WORKDIR + +# compile +cargo build --release + +# execute +./target/release/sa-builder -d "$prefix"uniprot_protein_database_minimal.tsv -t "$prefix"taxons.tsv --sparseness-factor 3 --construction-algorithm lib-div-suf-sort -o "$prefix"uniprot_suffix_array_sparse3.bin diff --git a/sa-builder/src/binary.rs b/sa-builder/src/binary.rs new file mode 100644 index 0000000..78e7603 --- /dev/null +++ b/sa-builder/src/binary.rs @@ -0,0 +1,134 @@ +use std::cmp::min; +use std::error::Error; +use std::fs::{File, OpenOptions}; +use std::io::{Read, Write}; + +const ONE_GIB: usize = 2usize.pow(30); + +/// Trait implemented by structs that are binary serializable +/// In our case this will be [i64], since the suffix array is a Vec<i64> +pub trait Serializable { + + /// Serializes self into a vector of bytes + /// + /// # Returns + /// + /// Returns a vector of bytes + fn serialize(&self) -> Vec<u8>; +} + +impl Serializable for [i64] { + fn serialize(&self) -> Vec<u8> { + let mut res = vec![]; + self.iter().for_each(|entry| + res.extend_from_slice(&entry.to_le_bytes()) + ); + res + } +} + +/// Deserializes a vector of bytes into the suffix array +/// +/// # Arguments +/// * `data` - The raw bytes that need to be deserialized into a suffix array +/// +/// # Returns +/// +/// Returns the suffix array, a Vec<i64> +fn deserialize_sa(data: &[u8]) -> Vec<i64> { + let mut res = vec![]; + if data.len() % 8 != 0 { + panic!("Serialized data is not a multiple of 8 bytes long!") + } + for start in (0..data.len()).step_by(8) { + res.push(i64::from_le_bytes(data[start..start + 8].try_into().unwrap())); + } + res +} + +/// Writes the given suffix array together with its `sparseness_factor` to the given file +/// +/// # Arguments +/// * `sparseness_factor` - The sparseness factor of the suffix array +/// * `suffix_array` - The suffix array +/// * `filename` - The name of the file we want to write the suffix array to +/// +/// # Returns +/// +/// Returns () if writing away the suffix array succeeded +/// +/// # Errors +/// +/// Returns an io::Error if writing away the suffix array failed +pub fn write_suffix_array(sparseness_factor: u8, suffix_array: &[i64], filename: &str) -> Result<(), std::io::Error> { + // create the file + let mut f = OpenOptions::new() + 
.create(true) + .write(true) + .truncate(true) // if the file already exists, empty the file + .open(filename)?; + f.write_all(&[sparseness_factor])?; // write the sample rate as the first byte + + // write 1 GiB at a time, to minimize extra used memory since we need to translate i64 to [u8; 8] + let sa_len = suffix_array.len(); + for start_index in (0..sa_len).step_by(ONE_GIB/8) { + let end_index = min(start_index + ONE_GIB/8, sa_len); + f.write_all(&suffix_array[start_index..end_index].serialize())?; + } + + Ok(()) +} + +/// Loads the suffix array from the file with the given `filename` +/// +/// # Arguments +/// * `filename` - The filename of the file where the suffix array is stored +/// +/// # Returns +/// +/// Returns the sample rate of the suffix array, together with the suffix array +/// +/// # Errors +/// +/// Returns any error from opening the file or reading the file +pub fn load_suffix_array(filename: &str) -> Result<(u8, Vec), Box> { + let mut file = &File::open(filename)?; + let mut sparseness_factor_buffer = [0_u8; 1]; + file.read_exact(&mut sparseness_factor_buffer).map_err(|_| "Could not read the sample rate from the binary file")?; + let sparseness_factor = sparseness_factor_buffer[0]; + + let mut sa = vec![]; + loop { + let mut buffer = vec![]; + // use take in combination with read_to_end to ensure that the buffer will be completely filled (except when the file is smaller than the buffer) + let count = file.take(ONE_GIB as u64).read_to_end(&mut buffer)?; + if count == 0 { + break; + } + sa.extend_from_slice(&deserialize_sa(&buffer[..count])); + } + + Ok((sparseness_factor, sa)) +} + + +#[cfg(test)] +mod tests { + use crate::binary::{deserialize_sa, Serializable}; + + #[test] + fn test_serialize_deserialize() { + let data: Vec = vec![5, 2165487362, -12315135]; + let serialized = data.serialize(); + let deserialized = deserialize_sa(serialized.as_ref()); + assert_eq!(data, deserialized); + } + + #[test] + fn test_serialize_deserialize_empty() { + let data: Vec = vec![]; + let serialized = data.serialize(); + let deserialized = deserialize_sa(serialized.as_ref()); + assert_eq!(data, deserialized); + } +} \ No newline at end of file diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs new file mode 100644 index 0000000..d009656 --- /dev/null +++ b/sa-builder/src/lib.rs @@ -0,0 +1,77 @@ +pub mod binary; + +use std::error::Error; +use clap::{Parser, ValueEnum}; + +/// Enum that represents all possible commandline arguments +#[derive(Parser, Debug)] +pub struct Arguments { + /// File with the proteins used to build the suffix tree. All the proteins are expected to be concatenated using a `#`. + #[arg(short, long)] + pub database_file: String, + #[arg(short, long)] + /// The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy. + pub taxonomy: String, + /// Output file to store the built index. 
+ #[arg(short, long)] + pub output: String, + /// The sparseness factor used on the suffix array (default value 1, which means every value in the SA is used) + #[arg(long, default_value_t = 1)] + pub sparseness_factor: u8, + #[arg(short, long, value_enum, default_value_t = SAConstructionAlgorithm::LibSais)] + pub construction_algorithm: SAConstructionAlgorithm, +} + +/// Enum representing the two possible algorithms to construct the suffix array +#[derive(ValueEnum, Clone, Debug, PartialEq)] +pub enum SAConstructionAlgorithm { + LibDivSufSort, + LibSais, +} + +/// Builds the suffix array over the given text using the selected construction algorithm +/// +/// # Arguments +/// * `data` - The text on which we want to build the suffix array +/// * `construction_algorithm` - The algorithm used during construction +/// * `sparseness_factor` - The sparseness factor used on the suffix array +/// +/// # Returns +/// +/// Returns the constructed suffix array +/// +/// # Errors +/// +/// Returns any error that occurred during the construction of the suffix array itself +pub fn build_sa(data: &mut Vec<u8>, construction_algorithm: &SAConstructionAlgorithm, sparseness_factor: u8) -> Result<Vec<i64>, Box<dyn Error>> { + + // translate all L's to an I + for character in data.iter_mut() { + if *character == b'L' { + *character = b'I' + } + } + + let mut sa = match construction_algorithm { + SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(data), + SAConstructionAlgorithm::LibDivSufSort => { + libdivsufsort_rs::divsufsort64(data) + } + }.ok_or("Building suffix array failed")?; + + // make the SA sparse and decrease the vector size if we use sampling (i.e. sparseness_factor > 1) + if sparseness_factor > 1 { + let mut current_sampled_index = 0; + for i in 0..sa.len() { + let current_sa_val = sa[i]; + if current_sa_val % sparseness_factor as i64 == 0 { + sa[current_sampled_index] = current_sa_val; + current_sampled_index += 1; + } + } + // shrink the vector to the number of sampled entries + sa.resize(current_sampled_index, 0); + } + + Ok(sa) +} \ No newline at end of file diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index e7a11a9..d738540 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -1,3 +1,38 @@ +use clap::Parser; +use sa_mappings::proteins::Proteins; +use sa_mappings::taxonomy::{AggregationMethod, TaxonAggregator}; +use sa_builder::{Arguments, build_sa}; +use sa_builder::binary::write_suffix_array; + fn main() { - println!("Hello, world!"); -} + let args = Arguments::parse(); + let Arguments { database_file, taxonomy, output, sparseness_factor, construction_algorithm } = args; + let taxon_id_calculator = TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar); + if let Err(err) = taxon_id_calculator { + eprintln!("{}", err); + std::process::exit(1); + } + + let taxon_id_calculator = taxon_id_calculator.unwrap(); + + // read input + let data = Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator); + if let Err(err) = data { + eprintln!("{}", err); + std::process::exit(1); + } + let mut data = data.unwrap(); + // calculate the suffix array + let sa = build_sa(&mut data, &construction_algorithm, sparseness_factor); + if let Err(err) = sa { + eprintln!("{}", err); + std::process::exit(1); + } + let sa = sa.unwrap(); + + // output the built SA + if let Err(err) = write_suffix_array(sparseness_factor, &sa, &output) { + eprintln!("{}", err); + std::process::exit(1); + }; +} \ No newline at end of file diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml index 3470891..c355bef 100644 --- a/sa-index/Cargo.toml +++ b/sa-index/Cargo.toml @@ -6,3 +6,10 @@
edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +clap = { version = "4.4.8", features = ["derive"] } +umgap = "1.1.0" +rayon = "1.8.1" +serde = { version = "1.0.197", features = ["derive"] } +sa-builder = { path = "../sa-builder" } +sa-mappings = { path = "../sa-mappings" } +serde_json = "1.0.116" diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs new file mode 100644 index 0000000..90753a6 --- /dev/null +++ b/sa-index/src/lib.rs @@ -0,0 +1,226 @@ +use std::error::Error; +use std::num::NonZeroUsize; + +use clap::{arg, Parser, ValueEnum}; + +use sa_mappings::functionality::FunctionAggregator; +use sa_mappings::proteins::Proteins; +use sa_mappings::taxonomy::{AggregationMethod, TaxonAggregator}; +use sa_builder::{build_sa, SAConstructionAlgorithm}; +use sa_builder::binary::{load_suffix_array, write_suffix_array}; + +use crate::peptide_search::{analyse_all_peptides, search_all_peptides}; +use crate::sa_searcher::Searcher; +use crate::suffix_to_protein_index::{ + DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex, SuffixToProteinMappingStyle, +}; +use crate::util::{get_time_ms, read_lines}; + +pub mod peptide_search; +pub mod sa_searcher; +pub mod suffix_to_protein_index; +pub mod util; + +/// Enum that represents the 2 kinds of search that are supported +#[derive(ValueEnum, Clone, Debug, PartialEq)] +pub enum SearchMode { + Search, + Analysis, +} + +/// Enum that represents all possible commandline arguments +#[derive(Parser, Debug)] +pub struct Arguments { + /// File with the proteins used to build the suffix tree. All the proteins are expected to be concatenated using a `#`. + #[arg(short, long)] + database_file: String, + #[arg(short, long)] + search_file: Option, + #[arg(short, long)] + /// The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy. + taxonomy: String, + /// This will only build the tree and stop after that is completed. Used during benchmarking. + #[arg(long)] + build_only: bool, + /// Output file to store the built index. + #[arg(short, long)] + output: Option, + /// The sparseness factor used on the suffix array (default value 1, which means every value in the SA is used) + #[arg(long, default_value_t = 1)] + sparseness_factor: u8, + /// Set the style used to map back from the suffix to the protein. 2 options or . 
Dense is default + /// Dense uses O(n) memory with n the size of the input text, and takes O(1) time to find the mapping + /// Sparse uses O(m) memory with m the number of proteins, and takes O(log m) to find the mapping + #[arg(long, value_enum, default_value_t = SuffixToProteinMappingStyle::Sparse)] + suffix_to_protein_mapping: SuffixToProteinMappingStyle, + #[arg(long)] + load_index: Option, + #[arg(short, long, value_enum, default_value_t = SAConstructionAlgorithm::LibSais)] + construction_algorithm: SAConstructionAlgorithm, + /// Assume the resulting taxon ID is root (1) whenever a peptide matches >= cutoff proteins + #[arg(long, default_value_t = 10000)] + cutoff: usize, + #[arg(long)] + threads: Option, + #[arg(long)] + equalize_i_and_l: bool, + #[arg(long)] + clean_taxa: bool, + #[arg(long, value_enum, default_value_t = SearchMode::Analysis)] + search_mode: SearchMode +} + + +/// Run the suffix array program +/// +/// # Arguments +/// * `args` - The commandline arguments provided to the program +/// +/// # Returns +/// +/// Unit +/// +/// # Errors +/// +/// Returns all possible errors that occurred during the program +pub fn run(mut args: Arguments) -> Result<(), Box> { + let taxon_id_calculator = + TaxonAggregator::try_from_taxonomy_file(&args.taxonomy, AggregationMethod::LcaStar)?; + + let sa = match &args.load_index { + // load SA from file + Some(index_file_name) => { + let (sparseness_factor, sa) = load_suffix_array(index_file_name)?; + args.sparseness_factor = sparseness_factor; + // println!("Loading the SA took {} ms and loading the proteins + SA took {} ms", end_loading_ms - start_loading_ms, end_loading_ms - start_reading_proteins_ms); + // TODO: some kind of security check that the loaded database file and SA match + sa + } + // build the SA + None => { + let protein_sequences = + Proteins::try_from_database_file(&args.database_file, &taxon_id_calculator)?; + build_sa( + &mut protein_sequences.input_string.clone(), + &args.construction_algorithm, + args.sparseness_factor, + )? 
+ } + }; + + let proteins = Proteins::try_from_database_file(&args.database_file, &taxon_id_calculator)?; + + if let Some(output) = &args.output { + write_suffix_array(args.sparseness_factor, &sa, output)?; + } + + // option that only builds the tree, but does not allow for querying (easy for benchmark purposes) + if args.build_only { + return Ok(()); + } + + // build the right mapping index, use box to be able to store both types in this variable + let suffix_index_to_protein: Box = + match args.suffix_to_protein_mapping { + SuffixToProteinMappingStyle::Dense => { + Box::new(DenseSuffixToProtein::new(&proteins.input_string)) + } + SuffixToProteinMappingStyle::Sparse => { + Box::new(SparseSuffixToProtein::new(&proteins.input_string)) + } + }; + + let functional_aggregator = FunctionAggregator {}; + + let searcher = Searcher::new( + sa, + args.sparseness_factor, + suffix_index_to_protein, + proteins, + taxon_id_calculator, + functional_aggregator, + ); + + execute_search(&searcher, &args)?; + Ok(()) +} + +/// Execute the search using the provided programs +/// +/// # Arguments +/// * `searcher` - The Searcher which contains the protein database +/// * `args` - The arguments used to start the program +/// +/// # Returns +/// +/// Unit +/// +/// # Errors +/// +/// Returns possible errors that occurred during search +fn execute_search(searcher: &Searcher, args: &Arguments) -> Result<(), Box> { + let cutoff = args.cutoff; + let search_file = args + .search_file + .as_ref() + .ok_or("No peptide file provided to search in the database")?; + + let start_time = get_time_ms()?; + let lines = read_lines(search_file)?; + let all_peptides: Vec = lines.map_while(Result::ok).collect(); + + // Explicitly set the number of threads to use if the commandline argument was set + if let Some(threads) = args.threads { + rayon::ThreadPoolBuilder::new() + .num_threads(threads.get()) + .build_global()?; + } + + match args.search_mode { + SearchMode::Search => { + let search_result = search_all_peptides( + searcher, + &all_peptides, + cutoff, + args.equalize_i_and_l, + args.clean_taxa, + ); + println!("{}", serde_json::to_string(&search_result)?); + } + SearchMode::Analysis => { + let search_result = analyse_all_peptides( + searcher, + &all_peptides, + cutoff, + args.equalize_i_and_l, + args.clean_taxa, + ); + println!("{}", serde_json::to_string(&search_result)?); + } + } + + let end_time = get_time_ms()?; + + // output to other channel to prevent integrating it into the actual output + eprintln!( + "Spend {} ms to search the whole file", + end_time - start_time + ); + + Ok(()) +} + +/// Custom trait implemented by types that have a value that represents NULL +pub trait Nullable { + const NULL: T; + + fn is_null(&self) -> bool; +} + +impl Nullable for u32 { + const NULL: u32 = u32::MAX; + + fn is_null(&self) -> bool { + *self == Self::NULL + } +} diff --git a/sa-index/src/main.rs b/sa-index/src/main.rs index e7a11a9..cd56323 100644 --- a/sa-index/src/main.rs +++ b/sa-index/src/main.rs @@ -1,3 +1,10 @@ +use clap::Parser; +use sa_index::{Arguments, run}; + fn main() { - println!("Hello, world!"); + let args = Arguments::parse(); + if let Err(error) = run(args) { + eprintln!("{}", error); + std::process::exit(1); + }; } diff --git a/sa-index/src/peptide_search.rs b/sa-index/src/peptide_search.rs new file mode 100644 index 0000000..d1277c0 --- /dev/null +++ b/sa-index/src/peptide_search.rs @@ -0,0 +1,252 @@ +use crate::sa_searcher::{SearchAllSuffixesResult, Searcher}; +use rayon::prelude::*; +use 
sa_mappings::functionality::FunctionalAggregation; +use sa_mappings::proteins::Protein; +use serde::Serialize; + +/// Struct representing a collection of `SearchResultWithAnalysis` or `SearchOnlyResult` results +#[derive(Debug, Serialize)] +pub struct OutputData { + result: Vec, +} + +/// Struct representing the search result of the `sequence` in the index, including the analyses +#[derive(Debug, Serialize)] +pub struct SearchResultWithAnalysis { + sequence: String, + lca: Option, + taxa: Vec, + uniprot_accession_numbers: Vec, + fa: Option, + cutoff_used: bool, +} + +/// Struct representing the search result of the `sequence` in the index (without the analyses) +#[derive(Debug, Serialize)] +pub struct SearchOnlyResult { + sequence: String, + proteins: Vec, + cutoff_used: bool, +} + +/// Struct that represents all information known about a certain protein in our database +#[derive(Debug, Serialize)] +pub struct ProteinInfo { + taxon: usize, + uniprot_accession: String, + functional_annotations: Vec, +} + +/// Searches the `peptide` in the index multithreaded and retrieves the matching proteins +/// +/// # Arguments +/// * `searcher` - The Searcher which contains the protein database +/// * `peptide` - The peptide that is being searched in the index +/// * `cutoff` - The maximum amount of matches we want to process from the index +/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search +/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the taxonomy +/// +/// # Returns +/// +/// Returns Some if matches are found. +/// The first argument is true if the cutoff is used, otherwise false +/// The second argument is a list of all matching proteins for the peptide +/// Returns None if the peptides does not have any matches, or if the peptide is shorter than the sparseness factor k used in the index +pub fn search_proteins_for_peptide<'a>( + searcher: &'a Searcher, + peptide: &str, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, +) -> Option<(bool, Vec<&'a Protein>)> { + let peptide = peptide.strip_suffix('\n').unwrap_or(peptide).to_uppercase(); + + // words that are shorter than the sample rate are not searchable + if peptide.len() < searcher.sparseness_factor as usize { + return None; + } + + let suffix_search = + searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equalize_i_and_l); + let mut cutoff_used = false; + let suffixes = match suffix_search { + SearchAllSuffixesResult::MaxMatches(matched_suffixes) => { + cutoff_used = true; + matched_suffixes + } + SearchAllSuffixesResult::SearchResult(matched_suffixes) => matched_suffixes, + SearchAllSuffixesResult::NoMatches => { + return None; + } + }; + + let mut proteins = searcher.retrieve_proteins(&suffixes); + if clean_taxa { + proteins.retain(|protein| searcher.taxon_valid(protein)) + } + + Some((cutoff_used, proteins)) +} + + +/// Searches the `peptide` in the index multithreaded and retrieves the protein information from the database +/// This does NOT perform any of the analyses, it only retrieves the functional and taxonomic annotations +/// +/// # Arguments +/// * `searcher` - The Searcher which contains the protein database +/// * `peptide` - The peptide that is being searched in the index +/// * `cutoff` - The maximum amount of matches we want to process from the index +/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search +/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are 
invalid in the taxonomy +/// +/// # Returns +/// +/// Returns Some(SearchOnlyResult) if the peptide has matches +/// Returns None if the peptides does not have any matches, or if the peptide is shorter than the sparseness factor k used in the index +pub fn search_peptide_retrieve_annotations( + searcher: &Searcher, + peptide: &str, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, +) -> Option { + let (cutoff_used, proteins) = + search_proteins_for_peptide(searcher, peptide, cutoff, equalize_i_and_l, clean_taxa)?; + + let annotations = searcher.get_all_functional_annotations(&proteins); + + let mut protein_info: Vec = vec![]; + for (&protein, annotations) in proteins.iter().zip(annotations) { + protein_info.push(ProteinInfo { + taxon: protein.taxon_id, + uniprot_accession: protein.uniprot_id.clone(), + functional_annotations: annotations, + }) + } + + Some(SearchOnlyResult { + sequence: peptide.to_string(), + proteins: protein_info, + cutoff_used, + }) +} + + +/// Searches the `peptide` in the index multithreaded and performs the taxonomic and functional analyses +/// +/// # Arguments +/// * `searcher` - The Searcher which contains the protein database +/// * `peptide` - The peptide that is being searched in the index +/// * `cutoff` - The maximum amount of matches we want to process from the index +/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search +/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the taxonomy +/// +/// # Returns +/// +/// Returns Some(SearchResultWithAnalysis) if the peptide has matches +/// Returns None if the peptides does not have any matches, or if the peptide is shorter than the sparseness factor k used in the index +pub fn analyse_peptide( + searcher: &Searcher, + peptide: &str, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, +) -> Option { + let (cutoff_used, mut proteins) = + search_proteins_for_peptide(searcher, peptide, cutoff, equalize_i_and_l, clean_taxa)?; + + if clean_taxa { + proteins.retain(|protein| searcher.taxon_valid(protein)) + } + + // calculate the lca + let lca = if cutoff_used { + Some(1) + } else { + searcher.retrieve_lca(&proteins) + }; + + // return None if the LCA is none + lca?; + + let mut uniprot_accession_numbers = vec![]; + let mut taxa = vec![]; + + for protein in &proteins { + taxa.push(protein.taxon_id); + uniprot_accession_numbers.push(protein.uniprot_id.clone()); + } + + let fa = searcher.retrieve_function(&proteins); + // output the result + Some(SearchResultWithAnalysis { + sequence: peptide.to_string(), + lca, + cutoff_used, + uniprot_accession_numbers, + taxa, + fa, + }) +} + + +/// Searches the list of `peptides` in the index multithreaded and performs the functional and taxonomic analyses +/// +/// # Arguments +/// * `searcher` - The Searcher which contains the protein database +/// * `peptides` - List of peptides we want to search in the index +/// * `cutoff` - The maximum amount of matches we want to process from the index +/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search +/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the taxonomy +/// +/// # Returns +/// +/// Returns an `OutputData` object with the search and analyses results for the peptides +pub fn analyse_all_peptides( + searcher: &Searcher, + peptides: &Vec, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, +) -> OutputData { + let res: Vec = peptides + 
.par_iter() + // calculate the results + .map(|peptide| analyse_peptide(searcher, peptide, cutoff, equalize_i_and_l, clean_taxa)) + // remove the None's + .filter_map(|search_result| search_result) + .collect(); + + OutputData { result: res } +} + +/// Searches the list of `peptides` in the index and retrieves all related information about the found proteins +/// This does NOT perform any of the analyses +/// +/// # Arguments +/// * `searcher` - The Searcher which contains the protein database +/// * `peptides` - List of peptides we want to search in the index +/// * `cutoff` - The maximum amount of matches we want to process from the index +/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search +/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the taxonomy +/// +/// # Returns +/// +/// Returns an `OutputData` object with the search results for the peptides +pub fn search_all_peptides( + searcher: &Searcher, + peptides: &Vec, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, +) -> OutputData { + let res: Vec = peptides + .par_iter() + // calculate the results + .map(|peptide| search_peptide_retrieve_annotations(searcher, peptide, cutoff, equalize_i_and_l, clean_taxa)) + // remove None's + .filter_map(|search_result| search_result) + .collect(); + + OutputData { result: res } +} diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs new file mode 100644 index 0000000..6e7b44b --- /dev/null +++ b/sa-index/src/sa_searcher.rs @@ -0,0 +1,506 @@ +use std::cmp::min; + + +use sa_mappings::functionality::{FunctionAggregator, FunctionalAggregation}; +use sa_mappings::proteins::{Protein, Proteins}; +use sa_mappings::taxonomy::TaxonAggregator; +use umgap::taxon::TaxonId; + +use crate::sa_searcher::BoundSearch::{Maximum, Minimum}; +use crate::suffix_to_protein_index::SuffixToProteinIndex; +use crate::Nullable; + +/// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array +#[derive(Clone, Copy, PartialEq)] +enum BoundSearch { + Minimum, + Maximum, +} + +/// Enum representing the minimum and maximum bound of the found matches in the suffix array +#[derive(PartialEq, Debug)] +pub enum BoundSearchResult { + NoMatches, + SearchResult((usize, usize)), +} + +/// Enum representing the matching suffixes after searching a peptide in the suffix array +/// Both the MaxMatches and SearchResult indicate found suffixes, but MaxMatches is used when the cutoff is reached. 
+#[derive(Debug)] +pub enum SearchAllSuffixesResult { + NoMatches, + MaxMatches(Vec), + SearchResult(Vec), +} + +/// Custom implementation of partialEq for SearchAllSuffixesResult +/// We consider 2 SearchAllSuffixesResult equal if they exist of the same key, and the Vec contains the same values, but the order can be different +impl PartialEq for SearchAllSuffixesResult { + fn eq(&self, other: &Self) -> bool { + + /// Returns true if `arr1` and `arr2` contains the same elements, the order of the elements is ignored + /// + /// # Arguments + /// * `arr1` - The first array used in the comparison + /// * `arr2` - The second array used in the comparison + /// + /// # Returns + /// + /// Returns true if arr1 and arr2 contains the same elements, the order of the elements is ignored + fn array_eq_unordered(arr1: &[i64], arr2: &[i64]) -> bool { + let mut arr1_copy = arr1.to_owned(); + let mut arr2_copy = arr2.to_owned(); + + arr1_copy.sort(); + arr2_copy.sort(); + + arr1_copy == arr2_copy + } + + match (self, other) { + ( + SearchAllSuffixesResult::MaxMatches(arr1), + SearchAllSuffixesResult::MaxMatches(arr2), + ) => array_eq_unordered(arr1, arr2), + ( + SearchAllSuffixesResult::SearchResult(arr1), + SearchAllSuffixesResult::SearchResult(arr2), + ) => array_eq_unordered(arr1, arr2), + (SearchAllSuffixesResult::NoMatches, SearchAllSuffixesResult::NoMatches) => true, + _ => false, + } + } +} + +/// Struct that contains all the elements needed to search a peptide in the suffix array +/// This struct also contains all the functions used for search +/// +/// # Arguments +/// * `sa` - The sparse suffix array representing the protein database +/// * `sparseness_factor` - The sparseness factor used by the suffix array +/// * `suffix_index_to_protein` - Mapping from a suffix to the proteins to know which a suffix is part of +/// * `taxon_id_calculator` - Object representing the used taxonomy and that calculates the taxonomic analysis provided by Unipept +/// * `function_aggregator` - Object used to retrieve the functional annotations and to calculate the functional analysis provided by Unipept +pub struct Searcher { + sa: Vec, + pub sparseness_factor: u8, + suffix_index_to_protein: Box, + proteins: Proteins, + taxon_id_calculator: TaxonAggregator, + function_aggregator: FunctionAggregator +} + +impl Searcher { + + /// Creates a new Searcher object + /// + /// # Arguments + /// * `sa` - The sparse suffix array representing the protein database + /// * `sparseness_factor` - The sparseness factor used by the suffix array + /// * `suffix_index_to_protein` - Mapping from a suffix to the proteins to know which a suffix is part of + /// * `proteins` - List of all the proteins where the suffix array is build on + /// * `taxon_id_calculator` - Object representing the used taxonomy and that calculates the taxonomic analysis provided by Unipept + /// * `function_aggregator` - Object used to retrieve the functional annotations and to calculate the functional analysis provided by Unipept + /// + /// # Returns + /// + /// Returns a new Searcher object + pub fn new( + sa: Vec, + sparseness_factor: u8, + suffix_index_to_protein: Box, + proteins: Proteins, + taxon_id_calculator: TaxonAggregator, + function_aggregator: FunctionAggregator + ) -> Self { + Self { + sa, + sparseness_factor, + suffix_index_to_protein, + proteins, + taxon_id_calculator, + function_aggregator + } + } + + /// Compares the `search_string` to the `suffix` + /// During search this function performs extra logic since the suffix array is build with I == 
L, while ` self.proteins.input_string` is the original text where I != L + /// + /// # Arguments + /// * `search_string` - The string/peptide being searched in the suffix array + /// * `suffix` - The current suffix from the suffix array we are comparing with in the binary search + /// * `skip` - How many characters we can skip in the comparison because we already know these match + /// * `bound` - Indicates if we are searching for the min of max bound + /// + /// # Returns + /// + /// The first argument is true if `bound` == `Minimum` and `search_string` <= `suffix` or if `bound` == `Maximum` and `search_string` >= `suffix` + /// The second argument indicates how far the `suffix` and `search_string` matched + fn compare( + &self, + search_string: &[u8], + suffix: i64, + skip: usize, + bound: BoundSearch, + ) -> (bool, usize) { + let mut index_in_suffix = (suffix as usize) + skip; + let mut index_in_search_string = skip; + let mut is_cond_or_equal = false; + + // Depending on if we are searching for the min of max bound our condition is different + let condition_check = match bound { + Minimum => |a: u8, b: u8| a < b, + Maximum => |a: u8, b: u8| a > b, + }; + + // match as long as possible + while index_in_search_string < search_string.len() + && index_in_suffix < self.proteins.input_string.len() + && (search_string[index_in_search_string] + == self.proteins.input_string[index_in_suffix] + || (search_string[index_in_search_string] == b'L' + && self.proteins.input_string[index_in_suffix] == b'I') + || (search_string[index_in_search_string] == b'I' + && self.proteins.input_string[index_in_suffix] == b'L')) + { + index_in_suffix += 1; + index_in_search_string += 1; + } + // check if match found OR current search string is smaller lexicographically (and the empty search string should not be found) + if !search_string.is_empty() { + if index_in_search_string == search_string.len() { + is_cond_or_equal = true + } else if index_in_suffix < self.proteins.input_string.len() { + // in our index every L was replaced by a I, so we need to replace them if we want to search in the right direction + let peptide_char = if search_string[index_in_search_string] == b'L' { + b'I' + } else { + search_string[index_in_search_string] + }; + + let protein_char = if self.proteins.input_string[index_in_suffix] == b'L' { + b'I' + } else { + self.proteins.input_string[index_in_suffix] + }; + + is_cond_or_equal = condition_check(peptide_char, protein_char); + } + } + + (is_cond_or_equal, index_in_search_string) + } + + /// Searches for the minimum or maximum bound for a string in the suffix array + /// + /// # Arguments + /// * `bound` - Indicates if we are searching the minimum or maximum bound + /// * `search_string` - The string/peptide we are searching in the suffix array + /// + /// # Returns + /// + /// The first argument is true if a match was found + /// The second argument indicates the index of the minimum or maximum bound for the match (depending on `bound`) + fn binary_search_bound(&self, bound: BoundSearch, search_string: &[u8]) -> (bool, usize) { + let mut left: usize = 0; + let mut right: usize = self.sa.len(); + let mut lcp_left: usize = 0; + let mut lcp_right: usize = 0; + let mut found = false; + + // repeat until search window is minimum size OR we matched the whole search string last iteration + while right - left > 1 { + let center = (left + right) / 2; + let skip = min(lcp_left, lcp_right); + let (retval, lcp_center) = self.compare(search_string, self.sa[center], skip, bound); + + found |= 
lcp_center == search_string.len(); + + // update the left and right bound, depending on if we are searching the min or max bound + if retval && bound == Minimum || !retval && bound == Maximum { + right = center; + lcp_right = lcp_center; + } else { + left = center; + lcp_left = lcp_center; + } + } + + // handle edge case to search at index 0 + if right == 1 && left == 0 { + let (retval, lcp_center) = + self.compare(search_string, self.sa[0], min(lcp_left, lcp_right), bound); + + found |= lcp_center == search_string.len(); + + if bound == Minimum && retval { + right = 0; + } + } + + match bound { + Minimum => (found, right), + Maximum => (found, left), + } + } + + /// Searches for the minimum and maximum bound for a string in the suffix array + /// + /// # Arguments + /// * `search_string` - The string/peptide we are searching in the suffix array + /// + /// # Returns + /// + /// Returns the minimum and maximum bound of all matches in the suffix array, or `NoMatches` if no matches were found + pub fn search_bounds(&self, search_string: &[u8]) -> BoundSearchResult { + let (found_min, min_bound) = self.binary_search_bound(Minimum, search_string); + + if !found_min { + return BoundSearchResult::NoMatches; + } + + let (_, max_bound) = self.binary_search_bound(Maximum, search_string); + + BoundSearchResult::SearchResult((min_bound, max_bound + 1)) + } + + /// Searches for the suffixes matching a search string + /// During search I and L can be equated + /// + /// # Arguments + /// * `search_string` - The string/peptide we are searching in the suffix array + /// * `max_matches` - The maximum amount of matches processed, if more matches are found we don't process them + /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false + /// + /// # Returns + /// + /// Returns all the matching suffixes + #[inline] + pub fn search_matching_suffixes( + &self, + search_string: &[u8], + max_matches: usize, + equalize_i_and_l: bool, + ) -> SearchAllSuffixesResult { + let mut matching_suffixes: Vec = vec![]; + let mut il_locations = vec![]; + for (i, &character) in search_string.iter().enumerate() { + if character == b'I' || character == b'L' { + il_locations.push(i); + } + } + + let mut skip: usize = 0; + while skip < self.sparseness_factor as usize { + let mut il_locations_start = 0; + while il_locations_start < il_locations.len() && il_locations[il_locations_start] < skip { + il_locations_start += 1; + } + let il_locations_current_suffix = &il_locations[il_locations_start..]; + let current_search_string_prefix = &search_string[..skip]; + let current_search_string_suffix = &search_string[skip..]; + let search_bound_result = self.search_bounds(&search_string[skip..]); + // if the shorter part is matched, see if what goes before the matched suffix matches the unmatched part of the prefix + if let BoundSearchResult::SearchResult((min_bound, max_bound)) = search_bound_result { + // try all the partially matched suffixes and store the matching suffixes in an array (stop when our max number of matches is reached) + let mut sa_index = min_bound; + while sa_index < max_bound { + let suffix = self.sa[sa_index] as usize; + // filter away matches where I was wrongfully equalized to L, and check the unmatched prefix + // when I and L equalized, we only need to check the prefix, not the whole match, when the prefix is 0, we don't need to check at all + if suffix >= skip + && ((skip == 0 + || Self::check_prefix( + current_search_string_prefix, + &self.proteins.input_string[suffix - 
skip..suffix], + equalize_i_and_l, + )) + && Self::check_suffix( + skip, + il_locations_current_suffix, + current_search_string_suffix, + &self.proteins.input_string + [suffix..suffix + search_string.len() - skip], + equalize_i_and_l, + )) + { + matching_suffixes.push((suffix - skip) as i64); + + // return if max number of matches is reached + if matching_suffixes.len() >= max_matches { + return SearchAllSuffixesResult::MaxMatches(matching_suffixes); + } + } + sa_index += 1; + } + } + skip += 1; + } + + if matching_suffixes.is_empty() { + SearchAllSuffixesResult::NoMatches + } else { + SearchAllSuffixesResult::SearchResult(matching_suffixes) + } + } + + /// Returns true of the prefixes are the same + /// if `equalize_i_and_l` is set to true, L and I are considered the same + /// + /// # Arguments + /// * `search_string_prefix` - The unchecked prefix of the string/peptide that is searched + /// * `index_prefix` - The unchecked prefix from the protein from the suffix array + /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false + /// + /// # Returns + /// + /// Returns true if `search_string_prefix` and `index_prefix` are considered the same, otherwise false + #[inline] + fn check_prefix( + search_string_prefix: &[u8], + index_prefix: &[u8], + equalize_i_and_l: bool, + ) -> bool { + if equalize_i_and_l { + search_string_prefix.iter().zip(index_prefix).all( + |(&search_character, &index_character)| { + search_character == index_character + || (search_character == b'I' && index_character == b'L') + || (search_character == b'L' && index_character == b'I') + }, + ) + } else { + search_string_prefix == index_prefix + } + } + + /// Returns true of the search_string and index_string are equal + /// This is automatically true if `equalize_i_and_l` is set to true, since there matched during search where I = L + /// If `equalize_i_and_l` is set to false, we need to check if the I and L locations have the same character + /// + /// # Arguments + /// * `skip` - The used skip factor during the search iteration + /// * `il_locations` - The locations of the I's and L's in the **original** peptide + /// * `search_string` - The peptide that is being searched, but already with the skipped prefix removed from it + /// * `index_string` - The suffix that search_string matches with when I and L were equalized during search + /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false + /// + /// # Returns + /// + /// Returns true if `search_string` and `index_string` are considered the same, otherwise false + fn check_suffix( + skip: usize, + il_locations: &[usize], + search_string: &[u8], + index_string: &[u8], + equalize_i_and_l: bool, + ) -> bool { + if equalize_i_and_l { + true + } else { + for &il_location in il_locations { + let index = il_location - skip; + if search_string[index] != index_string[index] { + return false; + } + } + true + } + } + + /// Returns all the proteins that correspond with the provided suffixes + /// + /// # Arguments + /// * `suffixes` - List of suffix indices + /// + /// # Returns + /// + /// Returns the proteins that every suffix is a part of + #[inline] + pub fn retrieve_proteins(&self, suffixes: &Vec) -> Vec<&Protein> { + let mut res = vec![]; + for &suffix in suffixes { + let protein_index = self.suffix_index_to_protein.suffix_to_protein(suffix); + if !protein_index.is_null() { + res.push(&self.proteins[protein_index as usize]); + } + } + res + } + + /// Searches all the matching proteins for a 
search_string/peptide in the suffix array + /// + /// # Arguments + /// * `search_string` - The string/peptide being searched + /// * `equalize_i_and_l` - If set to true, I and L are equalized during search + /// + /// # Returns + /// + /// Returns the matching proteins for the search_string + pub fn search_proteins_for_peptide(&self, search_string: &[u8], equalize_i_and_l: bool) -> Vec<&Protein> { + let mut matching_suffixes = vec![]; + if let SearchAllSuffixesResult::SearchResult(suffixes) = + self.search_matching_suffixes(search_string, usize::MAX, equalize_i_and_l) + { + matching_suffixes = suffixes; + } + self.retrieve_proteins(&matching_suffixes) + } + + /// Retrieves the taxonomic analysis for a collection of proteins + /// + /// # Arguments + /// * `proteins` - A collection of proteins + /// + /// # Returns + /// + /// Returns the taxonomic analysis result for the given list of proteins + #[inline] + pub fn retrieve_lca(&self, proteins: &[&Protein]) -> Option { + let taxon_ids: Vec = proteins.iter().map(|prot| prot.taxon_id).collect(); + + self.taxon_id_calculator + .aggregate(taxon_ids) + .map(|id| self.taxon_id_calculator + .snap_taxon(id) + ) + } + + /// Returns true if the protein is considered valid by the provided taxonomy file + /// + /// # Arguments + /// * `protein` - A protein of which we want to check the validity + /// + /// # Returns + /// + /// Returns true if the protein is considered valid by the provided taxonomy file + pub fn taxon_valid(&self, protein: &Protein) -> bool { + self.taxon_id_calculator.taxon_valid(protein.taxon_id) + } + + /// Retrieves the functional analysis for a collection of proteins + /// + /// # Arguments + /// * `proteins` - A collection of proteins + /// + /// # Returns + /// + /// Returns the functional analysis result for the given list of proteins + pub fn retrieve_function(&self, proteins: &[&Protein]) -> Option { + let res = self.function_aggregator.aggregate(proteins.to_vec()); + Some(res) + } + + /// Retrieves the all the functional annotations for a collection of proteins + /// + /// # Arguments + /// * `proteins` - A collection of proteins + /// + /// # Returns + /// + /// Returns all functional annotations for a collection of proteins + pub fn get_all_functional_annotations(&self, proteins: &[&Protein]) -> Vec> { + self.function_aggregator.get_all_functional_annotations(proteins) + } + +} diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs new file mode 100644 index 0000000..395a51c --- /dev/null +++ b/sa-index/src/suffix_to_protein_index.rs @@ -0,0 +1,159 @@ +use clap::ValueEnum; +use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER}; +use crate::Nullable; + +/// Enum used to define the commandline arguments and choose which index style is used +#[derive(ValueEnum, Clone, Debug, PartialEq)] +pub enum SuffixToProteinMappingStyle { + Dense, + Sparse +} + +/// Trait implemented by the SuffixToProtein mappings +pub trait SuffixToProteinIndex: Send + Sync { + + /// Returns the index of the protein in the protein list for the given suffix + /// + /// # Arguments + /// * `suffix` - The suffix of which we want to know of which protein it is a part + /// + /// # Returns + /// + /// Returns the index of the protein in the proteins list of which the suffix is a part + fn suffix_to_protein(&self, suffix: i64) -> u32; +} + +/// Mapping that uses O(n) memory with n the size of the input text, but retrieval of the protein is in O(1) +#[derive(Debug, PartialEq)] +pub struct 
DenseSuffixToProtein { + // UniProtKB does not have more that u32::MAX proteins, so a larger type is not needed + mapping: Vec, +} + +/// Mapping that uses O(m) memory with m the number of proteins, but retrieval of the protein is O(log m) +#[derive(Debug, PartialEq)] +pub struct SparseSuffixToProtein { + mapping: Vec, +} + +impl SuffixToProteinIndex for DenseSuffixToProtein { + fn suffix_to_protein(&self, suffix: i64) -> u32 { + self.mapping[suffix as usize] + } +} + +impl SuffixToProteinIndex for SparseSuffixToProtein { + fn suffix_to_protein(&self, suffix: i64) -> u32 { + let protein_index = self.mapping.binary_search(&suffix).unwrap_or_else(|index| index - 1); + // if the next value in the mapping is 1 larger than the current suffix, that means that the current suffix starts with a SEPARATION_CHARACTER or TERMINATION_CHARACTER + // this means it does not belong to a protein + if self.mapping[protein_index + 1] == suffix + 1 { + return u32::NULL + } + protein_index as u32 + } +} + +impl DenseSuffixToProtein { + + /// Creates a new DenseSuffixToProtein mapping + /// + /// # Arguments + /// * `text` - The text over which we want to create the mapping + /// + /// # Returns + /// + /// Returns a new DenseSuffixToProtein build over the provided text + pub fn new(text: &[u8]) -> Self { + let mut current_protein_index: u32 = 0; + let mut suffix_index_to_protein: Vec = vec![]; + for &char in text.iter() { + if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER { + current_protein_index += 1; + suffix_index_to_protein.push(u32::NULL); + } else { + assert_ne!(current_protein_index, u32::NULL); + suffix_index_to_protein.push(current_protein_index); + } + } + suffix_index_to_protein.shrink_to_fit(); + DenseSuffixToProtein { mapping: suffix_index_to_protein } + } +} + +impl SparseSuffixToProtein { + + /// Creates a new SparseSuffixToProtein mapping + /// + /// # Arguments + /// * `text` - The text over which we want to create the mapping + /// + /// # Returns + /// + /// Returns a new SparseSuffixToProtein build over the provided text + pub fn new(text: &[u8]) -> Self { + let mut suffix_index_to_protein: Vec = vec![0]; + for (index, &char) in text.iter().enumerate() { + if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER { + suffix_index_to_protein.push(index as i64 + 1); + } + } + suffix_index_to_protein.shrink_to_fit(); + SparseSuffixToProtein { mapping: suffix_index_to_protein } + } + +} + + +#[cfg(test)] +mod tests { + use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER}; + use crate::Nullable; + use crate::suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex}; + + fn build_text() -> Vec { + let mut text = ["ACG", "CG", "AAA"].join(&format!("{}", SEPARATION_CHARACTER as char)); + text.push(TERMINATION_CHARACTER as char); + text.into_bytes() + } + + #[test] + fn test_dense_build() { + let u8_text = &build_text(); + let index = DenseSuffixToProtein::new(u8_text); + let expected = DenseSuffixToProtein {mapping: vec![0, 0, 0, u32::NULL, 1, 1, u32::NULL, 2, 2, 2, u32::NULL]}; + assert_eq!(index, expected); + } + + #[test] + fn test_sparse_build() { + let u8_text = &build_text(); + let index = SparseSuffixToProtein::new(u8_text); + let expected = SparseSuffixToProtein {mapping: vec![0, 4, 7, 11]}; + assert_eq!(index, expected); + } + + #[test] + fn test_search_dense() { + let u8_text = &build_text(); + let index = DenseSuffixToProtein::new(u8_text); + assert_eq!(index.suffix_to_protein(5), 1); + 
assert_eq!(index.suffix_to_protein(7), 2); + // suffix that starts with SEPARATION_CHARACTER + assert_eq!(index.suffix_to_protein(3), u32::NULL); + // suffix that starts with TERMINATION_CHARACTER + assert_eq!(index.suffix_to_protein(10), u32::NULL); + } + + #[test] + fn test_search_sparse() { + let u8_text = &build_text(); + let index = SparseSuffixToProtein::new(u8_text); + assert_eq!(index.suffix_to_protein(5), 1); + assert_eq!(index.suffix_to_protein(7), 2); + // suffix that starts with SEPARATION_CHARACTER + assert_eq!(index.suffix_to_protein(3), u32::NULL); + // suffix that starts with TERMINATION_CHARACTER + assert_eq!(index.suffix_to_protein(10), u32::NULL); + } +} \ No newline at end of file diff --git a/sa-index/src/util.rs b/sa-index/src/util.rs new file mode 100644 index 0000000..821bdc7 --- /dev/null +++ b/sa-index/src/util.rs @@ -0,0 +1,55 @@ +use std::fs::File; +use std::io; +use std::io::BufRead; +use std::path::Path; +use std::time::{SystemTime, SystemTimeError, UNIX_EPOCH}; + +use crate::sa_searcher::Searcher; + +/// Gets the current time in ms +/// +/// # Returns +/// +/// Returns the current time in ms +/// +/// # Errors +/// +/// Returns a SystemTimeError if getting the current time somehow fails +#[allow(unused)] +pub fn get_time_ms() -> Result { + Ok(SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos() as f64 * 1e-6) +} + +/// Times how long the function `f`, that has the searcher as only argument executed +/// +/// # Returns +/// +/// Returns the execution time of `f`in ms +/// +/// # Errors +/// +/// Returns a SystemTimeError if getting the start or end time failed +#[allow(unused)] +pub fn time_execution( + searcher: &mut Searcher, + f: &dyn Fn(&mut Searcher) -> bool, +) -> Result<(bool, f64), SystemTimeError> { + let start_ms = get_time_ms()?; + let found = f(searcher); + let end_ms = get_time_ms()?; + Ok((found, end_ms - start_ms)) +} + +/// Opens `filename` and creates an iterator over it per line +/// +/// # Arguments +/// * `filename` - The file we want to iterate over per line +/// +/// # Returns +/// +/// Returns an Iterator to the Reader of the lines of the file. +pub fn read_lines
<P>
(filename: P) -> io::Result>> + where P: AsRef, { + let file = File::open(filename)?; + Ok(io::BufReader::new(file).lines()) +} diff --git a/sa-mappings/Cargo.toml b/sa-mappings/Cargo.toml index f02934a..6573fb0 100644 --- a/sa-mappings/Cargo.toml +++ b/sa-mappings/Cargo.toml @@ -12,3 +12,6 @@ tempdir = "0.3.7" fa-compression = { path = "../fa-compression" } bytelines = "2.5.0" umgap = "1.1.0" +serde_json = "1.0.115" +serde = { version = "1.0.197", features = ["derive"] } + diff --git a/sa-mappings/src/functionality.rs b/sa-mappings/src/functionality.rs index b26152b..b4c9c14 100644 --- a/sa-mappings/src/functionality.rs +++ b/sa-mappings/src/functionality.rs @@ -1,8 +1,21 @@ //! This module contains the FunctionAggregator struct that is responsible for aggregating the //! functional annotations of proteins. +use std::collections::{HashMap, HashSet}; +use serde::Serialize; + + use crate::proteins::Protein; +/// A struct that represents the functional annotations once aggregated +#[derive(Debug, Serialize)] +pub struct FunctionalAggregation { + /// A HashMap representing how many GO, EC and IPR terms were found + pub counts: HashMap, + /// A HashMap representing how often a certain functional annotation was found + pub data: HashMap, +} + /// A struct that represents a function aggregator pub struct FunctionAggregator {} @@ -14,46 +27,59 @@ impl FunctionAggregator { /// /// # Returns /// - /// Returns a string containing the aggregated functional annotations - pub fn aggregate(&self, proteins: Vec) -> String { - proteins - .iter() - .map(|protein| protein.get_functional_annotations()) - .collect::>() - .join(";") + /// Returns a JSON string containing the aggregated functional annotations + pub fn aggregate(&self, proteins: Vec<&Protein>) -> FunctionalAggregation { + // Keep track of the proteins that have a certain annotation + let mut proteins_with_ec: HashSet = HashSet::new(); + let mut proteins_with_go: HashSet = HashSet::new(); + let mut proteins_with_ipr: HashSet = HashSet::new(); + + // Keep track of the counts of the different annotations + let mut data: HashMap = HashMap::new(); + + for protein in proteins.iter() { + for annotation in protein.get_functional_annotations().split(';') { + match annotation.chars().next() { + Some('E') => proteins_with_ec.insert(protein.uniprot_id.clone()), + Some('G') => proteins_with_go.insert(protein.uniprot_id.clone()), + Some('I') => proteins_with_ipr.insert(protein.uniprot_id.clone()), + _ => false + }; + + data.entry(annotation.to_string()).and_modify(|c| *c += 1).or_insert(1); + } + } + + let mut counts: HashMap = HashMap::new(); + counts.insert("all".to_string(), proteins.len()); + counts.insert("EC".to_string(), proteins_with_ec.len()); + counts.insert("GO".to_string(), proteins_with_go.len()); + counts.insert("IPR".to_string(), proteins_with_ipr.len()); + + data.remove(""); + + FunctionalAggregation { counts, data } } -} -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_aggregate() { - let proteins = vec![ - Protein { - uniprot_id: "uniprot1".to_string(), - sequence: (0, 3), - taxon_id: 1, - functional_annotations: vec![ - 0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27, - ] - }, - Protein { - uniprot_id: "uniprot2".to_string(), - sequence: (4, 3), - taxon_id: 2, - functional_annotations: vec![ - 0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27, - ] - }, - ]; - - let function_aggregator = FunctionAggregator {}; - - assert_eq!( - function_aggregator.aggregate(proteins), - 
"GO:0009279;IPR:IPR016364;IPR:IPR008816;GO:0009279;IPR:IPR016364;IPR:IPR008816" - ); + + /// Aggregates the functional annotations of proteins + /// + /// # Arguments + /// * `proteins` - A vector of proteins + /// + /// # Returns + /// + /// Returns a list of lists with all the functional annotations per protein + pub fn get_all_functional_annotations(&self, proteins: &[&Protein]) -> Vec> { + proteins + .iter() + .map( + |&prot| + prot.get_functional_annotations() + .split(';') + .map(|ann| ann.to_string()) + .collect() + ) + .collect::>>() } } diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs index a79dd3d..cda9b94 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -27,9 +27,6 @@ pub struct Protein { /// The id of the protein pub uniprot_id: String, - /// start position and length of the protein in the input string - pub sequence: (usize, u32), - /// the taxon id of the protein pub taxon_id: TaxonId, @@ -40,7 +37,7 @@ pub struct Protein { /// A struct that represents a collection of proteins pub struct Proteins { /// The input string containing all proteins - input_string: Vec, + pub input_string: Vec, /// The proteins in the input string proteins: Vec @@ -76,8 +73,6 @@ impl Proteins { let file = File::open(file)?; - let mut start_index = 0; - // Read the lines as bytes, since the input string is not guaranteed to be utf8 // because of the encoded functional annotations let mut lines = ByteLines::new(BufReader::new(file)); @@ -100,38 +95,68 @@ impl Proteins { proteins.push(Protein { uniprot_id: uniprot_id.to_string(), - sequence: (start_index, sequence.len() as u32), taxon_id, functional_annotations }); - start_index += sequence.len() + 1; } input_string.pop(); input_string.push(TERMINATION_CHARACTER.into()); - + input_string.shrink_to_fit(); + proteins.shrink_to_fit(); Ok(Self { input_string: input_string.into_bytes(), proteins }) } - /// Returns the sequence of a protein + /// Creates a `vec` which represents all the proteins concatenated from the database file /// /// # Arguments - /// * `protein` - The protein to get the sequence from + /// * `file` - The path to the database file + /// * `taxon_aggregator` - The `TaxonAggregator` to use /// /// # Returns /// - /// Returns a string slice containing the sequence of the protein - pub fn get_sequence(&self, protein: &Protein) -> &str { - let (start, length) = protein.sequence; - let end = start + length as usize; + /// Returns a `Result` containing the `Vec` + /// + /// # Errors + /// + /// Returns a `Box` if an error occurred while reading the database file + pub fn try_from_database_file_without_annotations(database_file: &str, taxon_aggregator: &TaxonAggregator) -> Result, Box> { + let mut input_string: String = String::new(); + + let file = File::open(database_file)?; + + // Read the lines as bytes, since the input string is not guaranteed to be utf8 + // because of the encoded functional annotations + let mut lines = ByteLines::new(BufReader::new(file)); + + while let Some(Ok(line)) = lines.next() { + let mut fields = line.split(|b| *b == b'\t'); + + // only get the taxon id and sequence from each line, we don't need the other parts + fields.next(); + let taxon_id = from_utf8(fields.next().unwrap())?.parse::()?; + let sequence = from_utf8(fields.next().unwrap())?; + fields.next(); + + if !taxon_aggregator.taxon_exists(taxon_id) { + continue; + } + + input_string.push_str(&sequence.to_uppercase()); + input_string.push(SEPARATION_CHARACTER.into()); + } + + input_string.pop(); + 
input_string.push(TERMINATION_CHARACTER.into()); - // unwrap should never fail since the input string will always be utf8 - std::str::from_utf8(&self.input_string[start .. end]).unwrap() + input_string.shrink_to_fit(); + Ok(input_string.into_bytes()) } + } impl Index for Proteins { @@ -210,13 +235,11 @@ mod tests { fn test_new_protein() { let protein = Protein { uniprot_id: "P12345".to_string(), - sequence: (0, 3), taxon_id: 1, functional_annotations: vec![0xD1, 0x11] }; assert_eq!(protein.uniprot_id, "P12345"); - assert_eq!(protein.sequence, (0, 3)); assert_eq!(protein.taxon_id, 1); assert_eq!(protein.functional_annotations, vec![0xD1, 0x11]); } @@ -230,13 +253,11 @@ mod tests { proteins: vec![ Protein { uniprot_id: "P12345".to_string(), - sequence: (0, 3), taxon_id: 1, functional_annotations: vec![0xD1, 0x11] }, Protein { uniprot_id: "P54321".to_string(), - sequence: (4, 3), taxon_id: 2, functional_annotations: vec![0xD1, 0x11] }, @@ -249,19 +270,17 @@ mod tests { ); assert_eq!(proteins.proteins.len(), 2); assert_eq!(proteins.proteins[0].uniprot_id, "P12345"); - assert_eq!(proteins.proteins[0].sequence, (0, 3)); assert_eq!(proteins.proteins[0].taxon_id, 1); assert_eq!(proteins.proteins[0].functional_annotations, vec![0xD1, 0x11]); assert_eq!(proteins.proteins[1].uniprot_id, "P54321"); - assert_eq!(proteins.proteins[1].sequence, (4, 3)); assert_eq!(proteins.proteins[1].taxon_id, 2); assert_eq!(proteins.proteins[1].functional_annotations, vec![0xD1, 0x11]); } #[test] - fn test_get_sequence() { + fn test_get_taxon() { // Create a temporary directory for this test - let tmp_dir = TempDir::new("test_get_sequences").unwrap(); + let tmp_dir = TempDir::new("test_get_taxon").unwrap(); let database_file = create_database_file(&tmp_dir); let taxonomy_file = create_taxonomy_file(&tmp_dir); @@ -270,25 +289,21 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::Lca ) - .unwrap(); + .unwrap(); let proteins = Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator) .unwrap(); - //assert_eq!(proteins.proteins.len(), 4); - assert_eq!(proteins.get_sequence(&proteins[0]), "MLPGLALLLLAAWTARALEV"); - assert_eq!(proteins.get_sequence(&proteins[1]), "PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG"); - assert_eq!(proteins.get_sequence(&proteins[2]), "KWDSDPSGTKTCIDT"); - assert_eq!( - proteins.get_sequence(&proteins[3]), - "KEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH" - ); + let taxa = vec![1, 2, 6, 17]; + for (i, protein) in proteins.proteins.iter().enumerate() { + assert_eq!(protein.taxon_id, taxa[i]); + } } #[test] - fn test_get_taxon() { + fn test_get_functional_annotations() { // Create a temporary directory for this test - let tmp_dir = TempDir::new("test_get_taxon").unwrap(); + let tmp_dir = TempDir::new("test_get_fa").unwrap(); let database_file = create_database_file(&tmp_dir); let taxonomy_file = create_taxonomy_file(&tmp_dir); @@ -297,19 +312,21 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::Lca ) - .unwrap(); + .unwrap(); let proteins = Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator) .unwrap(); - let taxa = vec![1, 2, 6, 17]; - for (i, protein) in proteins.proteins.iter().enumerate() { - assert_eq!(protein.taxon_id, taxa[i]); + for protein in proteins.proteins.iter() { + assert_eq!( + decode(&protein.functional_annotations), + "GO:0009279;IPR:IPR016364;IPR:IPR008816" + ); } } #[test] - fn test_get_functional_annotations() { + fn test_get_concatenated_proteins() { // Create a temporary directory for this test let 
tmp_dir = TempDir::new("test_get_fa").unwrap(); @@ -320,16 +337,14 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::Lca ) - .unwrap(); + .unwrap(); let proteins = - Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator) + Proteins::try_from_database_file_without_annotations(database_file.to_str().unwrap(), &taxon_aggregator) .unwrap(); - for protein in proteins.proteins.iter() { - assert_eq!( - decode(&protein.functional_annotations), - "GO:0009279;IPR:IPR016364;IPR:IPR008816" - ); - } + let sep_char = SEPARATION_CHARACTER as char; + let end_char = TERMINATION_CHARACTER as char; + let expected = format!("MLPGLALLLLAAWTARALEV{}PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG{}KWDSDPSGTKTCIDT{}KEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH{}", sep_char, sep_char, sep_char, end_char); + assert_eq!(proteins, expected.as_bytes()); } } diff --git a/sa-mappings/src/taxonomy.rs b/sa-mappings/src/taxonomy.rs index ada93ff..9ba1e26 100644 --- a/sa-mappings/src/taxonomy.rs +++ b/sa-mappings/src/taxonomy.rs @@ -91,6 +91,23 @@ impl TaxonAggregator { self.taxon_list.get(taxon).is_some() } + /// Checks if a taxon is valid to be used during taxonomic aggregation + /// + /// # Arguments + /// + /// * `taxon` - The taxon ID to check. + /// + /// # Returns + /// + /// Returns a boolean value indicating whether the taxon exists and is valid + pub fn taxon_valid(&self, taxon: TaxonId) -> bool { + let optional_taxon = self.taxon_list.get(taxon); + match optional_taxon { + None => false, + Some(taxon) => taxon.valid + } + } + /// Snaps a taxon to its closest ancestor in the taxonomic tree. /// /// # Arguments @@ -109,15 +126,22 @@ impl TaxonAggregator { /// # Arguments /// /// * `taxa` - A vector of taxon IDs to aggregate. + /// * `clean_taxa` - If true, only the taxa which are stored as "valid" are used during aggregation /// /// # Returns /// - /// Returns the aggregated taxon ID, or panics if aggregation fails. - pub fn aggregate(&self, taxa: Vec) -> TaxonId { - let count = count(taxa.into_iter().map(|t| (t, 1.0))); - self.aggregator + /// Returns the aggregated taxon ID wrapped in Some if aggregation succeeds, + /// Returns None if the list of taxa to aggregate is emtpy, + /// Panics if aggregation fails. 
+ pub fn aggregate(&self, taxa: Vec, ) -> Option { + if taxa.is_empty() { + return None + } + + let count = count(taxa.into_iter().map(|t| (t, 1.0_f32))); + Some(self.aggregator .aggregate(&count) - .unwrap_or_else(|_| panic!("Could not aggregate following taxon ids: {:?}", &count)) + .unwrap_or_else(|_| panic!("Could not aggregate following taxon ids: {:?}", &count))) } } @@ -151,6 +175,7 @@ mod tests { writeln!(file, "18\tPelobacter\tgenus\t17\t\x01").unwrap(); writeln!(file, "19\tSyntrophotalea carbinolica\tspecies\t17\t\x01").unwrap(); writeln!(file, "20\tPhenylobacterium\tgenus\t19\t\x01").unwrap(); + writeln!(file, "21\tInvalid\tspecies\t19\t\x00").unwrap(); taxonomy_file } @@ -166,12 +191,12 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::Lca ) - .unwrap(); + .unwrap(); let _ = TaxonAggregator::try_from_taxonomy_file( taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar ) - .unwrap(); + .unwrap(); } #[test] @@ -185,7 +210,7 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::Lca ) - .unwrap(); + .unwrap(); for i in 0 ..= 20 { if [0, 3, 4, 5, 8, 12, 15].contains(&i) { @@ -207,7 +232,7 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::Lca ) - .unwrap(); + .unwrap(); for i in 0 ..= 20 { if ![0, 3, 4, 5, 8, 12, 15].contains(&i) { @@ -227,11 +252,11 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::Lca ) - .unwrap(); + .unwrap(); - assert_eq!(taxon_aggregator.aggregate(vec![7, 9]), 6); - assert_eq!(taxon_aggregator.aggregate(vec![11, 14]), 10); - assert_eq!(taxon_aggregator.aggregate(vec![17, 19]), 17); + assert_eq!(taxon_aggregator.aggregate(vec![7, 9]), Some(6)); + assert_eq!(taxon_aggregator.aggregate(vec![11, 14]), Some(10)); + assert_eq!(taxon_aggregator.aggregate(vec![17, 19]), Some(17)); } #[test] @@ -245,10 +270,10 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar ) - .unwrap(); + .unwrap(); - assert_eq!(taxon_aggregator.aggregate(vec![7, 9]), 6); - assert_eq!(taxon_aggregator.aggregate(vec![11, 14]), 10); - assert_eq!(taxon_aggregator.aggregate(vec![17, 19]), 19); + assert_eq!(taxon_aggregator.aggregate(vec![7, 9]), Some(6)); + assert_eq!(taxon_aggregator.aggregate(vec![11, 14]), Some(10)); + assert_eq!(taxon_aggregator.aggregate(vec![17, 19]), Some(19)); } } diff --git a/sa-server/Cargo.toml b/sa-server/Cargo.toml index 18698eb..c1920f5 100644 --- a/sa-server/Cargo.toml +++ b/sa-server/Cargo.toml @@ -6,3 +6,10 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +axum = { version = "0.7.4", features = ["macros"] } +serde = { version = "1.0.197", features = ["derive"] } +tokio = { version = "1.36.0", features = ["rt", "rt-multi-thread", "macros"] } +sa-index = { path = "../sa-index" } +clap = { version = "4.5.1", features = ["derive"] } +sa-builder = { path = "../sa-builder" } +sa-mappings = { path = "../sa-mappings" } \ No newline at end of file diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs index e7a11a9..52bb657 100644 --- a/sa-server/src/main.rs +++ b/sa-server/src/main.rs @@ -1,3 +1,184 @@ -fn main() { - println!("Hello, world!"); +use std::error::Error; +use std::sync::Arc; + +use axum::{http::StatusCode, Json, Router}; +use axum::extract::{DefaultBodyLimit, State}; +use axum::routing::{get, post}; +use clap::Parser; +use serde::{Deserialize, Serialize}; + +use sa_mappings::functionality::FunctionAggregator; +use sa_mappings::proteins::Proteins; +use 
sa_mappings::taxonomy::{AggregationMethod, TaxonAggregator}; +use sa_index::peptide_search::{OutputData, analyse_all_peptides, SearchResultWithAnalysis, SearchOnlyResult, search_all_peptides}; +use sa_index::sa_searcher::Searcher; +use sa_index::suffix_to_protein_index::SparseSuffixToProtein; +use sa_builder::binary::load_suffix_array; + +/// Enum that represents all possible commandline arguments +#[derive(Parser, Debug)] +pub struct Arguments { + /// File with the proteins used to build the suffix tree. All the proteins are expected to be concatenated using a `#`. + #[arg(short, long)] + database_file: String, + #[arg(short, long)] + index_file: String, + #[arg(short, long)] + /// The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy. + taxonomy: String, +} + +/// Function used by serde to place a default value in the cutoff field of the input +fn default_cutoff() -> usize { + 10000 +} + +/// Function used by serde to use `true` as a default value +#[allow(dead_code)] +fn default_true() -> bool { + true +} + +/// Struct representing the input arguments accepted by the endpoints +/// +/// # Arguments +/// * `peptides` - List of peptides we want to process +/// * `cutoff` - The maximum amount of matches to process, default value 10000 +/// * `equalize_I_and_L` - True if we want to equalize I and L during search +/// * `clean_taxa` - True if we only want to use proteins marked as "valid" +#[derive(Debug, Deserialize, Serialize)] +#[allow(non_snake_case)] +struct InputData { + peptides: Vec, + #[serde(default = "default_cutoff")] // default value is 10000 + cutoff: usize, + #[serde(default = "bool::default")] + // default value is false // TODO: maybe default should be true? + equalize_I_and_L: bool, + #[serde(default = "bool::default")] // default value is false + clean_taxa: bool, +} + +#[tokio::main] +async fn main() { + let args = Arguments::parse(); + if let Err(err) = start_server(args).await { + eprintln!("{}", err); + std::process::exit(1); + } +} + +/// Basic handler used to check the server status +async fn root() -> &'static str { + "Server is online" +} + +/// Endpoint executed for peptide matching and taxonomic and functional analysis +/// +/// # Arguments +/// * `state(searcher)` - The searcher object provided by the server +/// * `data` - InputData object provided by the user with the peptides to be searched and the config +/// +/// # Returns +/// +/// Returns the search and analysis results from the index as a JSON +async fn analyse( + State(searcher): State>, + data: Json, +) -> Result>, StatusCode> { + let search_result = analyse_all_peptides( + &searcher, + &data.peptides, + data.cutoff, + data.equalize_I_and_L, + data.clean_taxa, + ); + + Ok(Json(search_result)) +} + +/// Endpoint executed for peptide matching, without any analysis +/// +/// # Arguments +/// * `state(searcher)` - The searcher object provided by the server +/// * `data` - InputData object provided by the user with the peptides to be searched and the config +/// +/// # Returns +/// +/// Returns the search results from the index as a JSON +async fn search( + State(searcher): State>, + data: Json, +) -> Result>, StatusCode> { + let search_result = search_all_peptides( + &searcher, + &data.peptides, + data.cutoff, + data.equalize_I_and_L, + data.clean_taxa, + ); + + Ok(Json(search_result)) +} + +/// Starts the server with the provided commandline arguments +/// +/// # Arguments +/// * `args` - The provided commandline arguments +/// +/// # Returns +/// +/// Returns () +/// 
+/// # Errors +/// +/// Returns any error occurring during the startup or uptime of the server +async fn start_server(args: Arguments) -> Result<(), Box> { + let Arguments { + database_file, + index_file, + taxonomy, + } = args; + + eprintln!("Loading suffix array..."); + let (sparseness_factor, sa) = load_suffix_array(&index_file)?; + + eprintln!("Loading taxon file..."); + let taxon_id_calculator = + TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar)?; + + let function_aggregator = FunctionAggregator {}; + + eprintln!("Loading proteins..."); + let proteins = Proteins::try_from_database_file(&database_file, &taxon_id_calculator)?; + let suffix_index_to_protein = Box::new(SparseSuffixToProtein::new(&proteins.input_string)); + + eprintln!("Creating searcher..."); + let searcher = Arc::new(Searcher::new( + sa, + sparseness_factor, + suffix_index_to_protein, + proteins, + taxon_id_calculator, + function_aggregator, + )); + + // build our application with a route + let app = Router::new() + // `GET /` goes to `root` + .route("/", get(root)) + // `POST /analyse` goes to `analyse` and set max payload size to 5 MB + .route("/analyse", post(analyse)) + .layer(DefaultBodyLimit::max(5 * 10_usize.pow(6))) + .with_state(searcher.clone()) + // `POST /search` goes to `search` and set max payload size to 5 MB + .route("/search", post(search)) + .layer(DefaultBodyLimit::max(5 * 10_usize.pow(6))) + .with_state(searcher); + + let listener = tokio::net::TcpListener::bind("0.0.0.0:3000").await?; + println!("server is ready..."); + axum::serve(listener, app).await?; + + Ok(()) }
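
For illustration, the pieces added in this patch can also be driven directly from Rust without going through the HTTP layer. The sketch below simply mirrors the call sequence of `start_server` above; the file names ("uniprot.sa", "taxons.tsv", "uniprot.tsv") and the peptides are hypothetical placeholders, so treat this as a sketch of the intended usage rather than a tested program.

    use std::error::Error;

    use sa_builder::binary::load_suffix_array;
    use sa_index::peptide_search::analyse_all_peptides;
    use sa_index::sa_searcher::Searcher;
    use sa_index::suffix_to_protein_index::SparseSuffixToProtein;
    use sa_mappings::functionality::FunctionAggregator;
    use sa_mappings::proteins::Proteins;
    use sa_mappings::taxonomy::{AggregationMethod, TaxonAggregator};

    fn main() -> Result<(), Box<dyn Error>> {
        // Hypothetical input files, produced by sa-builder and the Unipept database dumps.
        let index_file = "uniprot.sa".to_string();
        let taxonomy_file = "taxons.tsv".to_string();
        let database_file = "uniprot.tsv".to_string();

        // Load the sparse suffix array, the taxonomy and the protein database,
        // exactly as start_server does.
        let (sparseness_factor, sa) = load_suffix_array(&index_file)?;
        let taxon_aggregator =
            TaxonAggregator::try_from_taxonomy_file(&taxonomy_file, AggregationMethod::LcaStar)?;
        let proteins = Proteins::try_from_database_file(&database_file, &taxon_aggregator)?;
        let suffix_index_to_protein = Box::new(SparseSuffixToProtein::new(&proteins.input_string));

        // Same constructor call as in start_server, but without wrapping the searcher in an Arc.
        let searcher = Searcher::new(
            sa,
            sparseness_factor,
            suffix_index_to_protein,
            proteins,
            taxon_aggregator,
            FunctionAggregator {},
        );

        // Search two example peptides with the server's default cutoff of 10000 matches,
        // without equating I and L and without filtering out taxonomically invalid proteins.
        let peptides = vec!["AALTER".to_string(), "MDGTEYIIVK".to_string()];
        let output = analyse_all_peptides(&searcher, &peptides, 10_000, false, false);
        println!("{:?}", output);

        Ok(())
    }

Over HTTP the same functionality is exposed by the new sa-server binary on port 3000: the request body for POST /analyse or POST /search is a JSON object with a required "peptides" list and the optional fields "cutoff" (default 10000), "equalize_I_and_L" (default false) and "clean_taxa" (default false).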