diff --git a/.rustfmt.toml b/.rustfmt.toml index cdbf5d1..74a1579 100644 --- a/.rustfmt.toml +++ b/.rustfmt.toml @@ -1,16 +1,15 @@ unstable_features = true version = "Two" -array_width = 90 -comment_width = 100 -imports_layout = "vertical" -imports_granularity = "crate" -fn_call_width = 90 -overflow_delimited_expr = false -reorder_impl_items = true +max_width = 120 +use_small_heuristics = "max" +chain_width = 100 +struct_lit_width = 60 +struct_variant_width = 60 +imports_granularity = "Crate" group_imports = "StdExternalCrate" -spaces_around_ranges = true -struct_field_align_threshold = 20 -struct_lit_single_line = false -trailing_comma = "never" -wrap_comments = true +normalize_comments = true +normalize_doc_attributes = true +overflow_delimited_expr = true +trailing_comma = "Never" +use_field_init_shorthand = true diff --git a/Cargo.lock b/Cargo.lock index 900c218..c29abc3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -32,15 +32,6 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" -[[package]] -name = "ansi_term" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" -dependencies = [ - "winapi", -] - [[package]] name = "anstream" version = "0.6.14" @@ -98,34 +89,7 @@ checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", -] - -[[package]] -name = "attohttpc" -version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe174d1b67f7b2bafed829c09db039301eb5841f66e43be2cf60b326e7f8e2cc" -dependencies = [ - "flate2", - "http 0.2.12", - "log", - "native-tls", - "openssl", - "serde", - "serde_json", - "url", -] - -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi 0.1.19", - "libc", - "winapi", + "syn", ] [[package]] @@ -145,7 +109,7 @@ dependencies = [ "axum-macros", "bytes", "futures-util", - "http 1.1.0", + "http", "http-body", "http-body-util", "hyper", @@ -178,7 +142,7 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http 1.1.0", + "http", "http-body", "http-body-util", "mime", @@ -199,7 +163,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.58", + "syn", ] [[package]] @@ -223,7 +187,7 @@ version = "0.69.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" dependencies = [ - "bitflags 2.5.0", + "bitflags", "cexpr", "clang-sys", "itertools", @@ -236,7 +200,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.58", + "syn", "which", ] @@ -244,12 +208,6 @@ dependencies = [ name = "bitarray" version = "0.1.0" -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.5.0" @@ -272,12 +230,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - [[package]] name = "bytes" version = "1.6.0" @@ -349,21 
+301,6 @@ dependencies = [ "libloading", ] -[[package]] -name = "clap" -version = "2.34.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" -dependencies = [ - "ansi_term", - "atty", - "bitflags 1.3.2", - "strsim 0.8.0", - "textwrap", - "unicode-width", - "vec_map", -] - [[package]] name = "clap" version = "4.5.4" @@ -383,7 +320,7 @@ dependencies = [ "anstream", "anstyle", "clap_lex", - "strsim 0.11.1", + "strsim", ] [[package]] @@ -395,7 +332,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.58", + "syn", ] [[package]] @@ -419,31 +356,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" -[[package]] -name = "core-foundation" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" -dependencies = [ - "core-foundation-sys", - "libc", -] - -[[package]] -name = "core-foundation-sys" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" - -[[package]] -name = "crc32fast" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" -dependencies = [ - "cfg-if", -] - [[package]] name = "criterion" version = "0.5.1" @@ -453,7 +365,7 @@ dependencies = [ "anes", "cast", "ciborium", - "clap 4.5.4", + "clap", "criterion-plot", "is-terminal", "itertools", @@ -511,27 +423,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" -[[package]] -name = "csv" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" -dependencies = [ - "csv-core", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" -dependencies = [ - "memchr", -] - [[package]] name = "either" version = "1.10.0" @@ -548,16 +439,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "error-chain" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc" -dependencies = [ - "backtrace", - "version_check", -] - [[package]] name = "fa-compression" version = "0.1.0" @@ -566,43 +447,12 @@ dependencies = [ "rand 0.8.5", ] -[[package]] -name = "fastrand" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "658bd65b1cf4c852a3cc96f18a8ce7b5640f6b703f905c7d74532294c2a63984" - -[[package]] -name = "flate2" -version = "1.0.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" -dependencies = [ - "crc32fast", - "miniz_oxide", -] - [[package]] name = "fnv" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" -[[package]] -name = "foreign-types" -version = "0.3.2" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - [[package]] name = "form_urlencoded" version = "1.2.1" @@ -612,16 +462,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "fst" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927fb434ff9f0115b215dc0efd2e4fbdd7448522a92a1aa37c77d6a2f8f1ebd6" -dependencies = [ - "byteorder", - "memmap", -] - [[package]] name = "fuchsia-cprng" version = "0.1.1" @@ -694,15 +534,6 @@ dependencies = [ "crunchy", ] -[[package]] -name = "heck" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" -dependencies = [ - "unicode-segmentation", -] - [[package]] name = "heck" version = "0.4.1" @@ -715,15 +546,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - [[package]] name = "hermit-abi" version = "0.3.9" @@ -739,17 +561,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "http" -version = "0.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" -dependencies = [ - "bytes", - "fnv", - "itoa", -] - [[package]] name = "http" version = "1.1.0" @@ -768,7 +579,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" dependencies = [ "bytes", - "http 1.1.0", + "http", ] [[package]] @@ -779,7 +590,7 @@ checksum = "0475f8b2ac86659c21b64320d5d653f9efe42acd2a4e560073ec61a155a34f1d" dependencies = [ "bytes", "futures-core", - "http 1.1.0", + "http", "http-body", "pin-project-lite", ] @@ -805,7 +616,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.1.0", + "http", "http-body", "httparse", "httpdate", @@ -823,7 +634,7 @@ checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" dependencies = [ "bytes", "futures-util", - "http 1.1.0", + "http", "http-body", "hyper", "pin-project-lite", @@ -831,23 +642,13 @@ dependencies = [ "tokio", ] -[[package]] -name = "idna" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" -dependencies = [ - "unicode-bidi", - "unicode-normalization", -] - [[package]] name = "is-terminal" version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" dependencies = [ - "hermit-abi 0.3.9", + "hermit-abi", "libc", "windows-sys 0.52.0", ] @@ -950,16 +751,6 @@ version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" -[[package]] -name = "memmap" 
-version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "mime" version = "0.3.17" @@ -992,24 +783,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "native-tls" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" -dependencies = [ - "lazy_static", - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", -] - [[package]] name = "nom" version = "7.1.3" @@ -1035,7 +808,7 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi 0.3.9", + "hermit-abi", "libc", ] @@ -1060,59 +833,6 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" -[[package]] -name = "openssl" -version = "0.10.64" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" -dependencies = [ - "bitflags 2.5.0", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.58", -] - -[[package]] -name = "openssl-probe" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" - -[[package]] -name = "openssl-sys" -version = "0.9.102" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" -dependencies = [ - "cc", - "libc", - "pkg-config", - "vcpkg", -] - -[[package]] -name = "ordered-float" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3305af35278dd29f46fcdd139e0b1fbfae2153f0e5928b39b035542dd31e37b7" -dependencies = [ - "num-traits", -] - [[package]] name = "percent-encoding" version = "2.3.1" @@ -1136,7 +856,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn", ] [[package]] @@ -1151,12 +871,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "pkg-config" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" - [[package]] name = "plotters" version = "0.3.5" @@ -1198,31 +912,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.58", -] - -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn 1.0.109", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2", - "quote", - "version_check", + "syn", ] [[package]] @@ -1386,7 +1076,7 @@ version = "0.38.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" dependencies = [ - "bitflags 2.5.0", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -1409,7 +1099,7 @@ checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" name = "sa-builder" version = "0.1.0" dependencies = [ - "clap 4.5.4", + "clap", "libdivsufsort-rs", "libsais64-rs", "sa-compression", @@ -1430,14 +1120,13 @@ name = "sa-index" version = "0.1.0" dependencies = [ "bitarray", - "clap 4.5.4", + "clap", "fa-compression", "rayon", "sa-mappings", "serde", "serde_json", "tempdir", - "umgap", ] [[package]] @@ -1446,10 +1135,7 @@ version = "0.1.0" dependencies = [ "bytelines", "fa-compression", - "serde", - "serde_json", "tempdir", - "umgap", ] [[package]] @@ -1457,7 +1143,7 @@ name = "sa-server" version = "0.1.0" dependencies = [ "axum", - "clap 4.5.4", + "clap", "sa-builder", "sa-compression", "sa-index", @@ -1475,38 +1161,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "schannel" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" -dependencies = [ - "windows-sys 0.52.0", -] - -[[package]] -name = "security-framework" -version = "2.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "770452e37cad93e0a50d5abc3990d2bc351c36d0328f86cefec2f2fb206eaef6" -dependencies = [ - "bitflags 1.3.2", - "core-foundation", - "core-foundation-sys", - "libc", - "security-framework-sys", -] - -[[package]] -name = "security-framework-sys" -version = "2.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f3cc463c0ef97e11c3461a9d3787412d30e8e7eb907c79180c4a57bf7c04ef" -dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "serde" version = "1.0.197" @@ -1524,7 +1178,7 @@ checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn", ] [[package]] @@ -1582,71 +1236,12 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "strsim" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" - [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" -[[package]] -name = "structopt" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" -dependencies = [ - "clap 2.34.0", - "lazy_static", - "structopt-derive", -] - -[[package]] -name = "structopt-derive" -version = "0.4.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" -dependencies = [ - "heck 0.3.3", - "proc-macro-error", - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "strum" -version = "0.17.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "530efb820d53b712f4e347916c5e7ed20deb76a4f0457943b3182fb889b06d2c" - -[[package]] -name = "strum_macros" -version = "0.17.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e6e163a520367c465f59e0a61a23cfae3b10b6546d78b6f672a382be79f7110" -dependencies = [ - "heck 0.3.3", - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - [[package]] name = "syn" version = "2.0.58" @@ -1680,27 +1275,6 @@ dependencies = [ "remove_dir_all", ] -[[package]] -name = "tempfile" -version = "3.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" -dependencies = [ - "cfg-if", - "fastrand", - "rustix", - "windows-sys 0.52.0", -] - -[[package]] -name = "textwrap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "unicode-width", -] - [[package]] name = "tinytemplate" version = "1.2.1" @@ -1711,21 +1285,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "tinyvec" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" - [[package]] name = "tokio" version = "1.37.0" @@ -1751,7 +1310,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn", ] [[package]] @@ -1802,95 +1361,18 @@ dependencies = [ "once_cell", ] -[[package]] -name = "umgap" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db403119ca827e0e097aa3268283072a373e65733f0c9edcdec06ea163741421" -dependencies = [ - "attohttpc", - "clap 2.34.0", - "csv", - "error-chain", - "fst", - "lazy_static", - "ordered-float", - "rayon", - "regex", - "serde_json", - "structopt", - "strum", - "strum_macros", -] - -[[package]] -name = "unicode-bidi" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" - [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" -[[package]] -name = "unicode-normalization" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" -dependencies = [ - "tinyvec", -] - -[[package]] -name = "unicode-segmentation" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" - -[[package]] -name = "unicode-width" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" - -[[package]] -name = "url" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" -dependencies = [ - "form_urlencoded", - "idna", - "percent-encoding", -] - [[package]] name = "utf8parse" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - -[[package]] -name = "vec_map" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" - -[[package]] -name = "version_check" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" - [[package]] name = "walkdir" version = "2.5.0" @@ -1928,7 +1410,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.58", + "syn", "wasm-bindgen-shared", ] @@ -1950,7 +1432,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs index 2609c3b..e7265cd 100644 --- a/bitarray/src/binary.rs +++ b/bitarray/src/binary.rs @@ -1,11 +1,6 @@ //! This module provides utilities for reading and writing the bitarray as binary. -use std::io::{ - BufRead, - Read, - Result, - Write -}; +use std::io::{BufRead, Read, Result, Write}; use crate::BitArray; @@ -69,9 +64,8 @@ impl Binary for BitArray { loop { let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer)?; - for buffer_slice in buffer[.. 
bytes_read].chunks_exact(8) { - self.data - .push(u64::from_le_bytes(buffer_slice.try_into().unwrap())); + for buffer_slice in buffer[..bytes_read].chunks_exact(8) { + self.data.push(u64::from_le_bytes(buffer_slice.try_into().unwrap())); } if finished { @@ -106,16 +100,13 @@ fn fill_buffer(input: &mut T, buffer: &mut Vec) -> std::io::Result< // No bytes written, which means we've completely filled the buffer // or we've reached the end of the file Ok(0) => { - return Ok(( - !writable_buffer_space.is_empty(), - buffer_size - writable_buffer_space.len() - )); + return Ok((!writable_buffer_space.is_empty(), buffer_size - writable_buffer_space.len())); } // We've read {bytes_read} bytes Ok(bytes_read) => { // Shrink the writable buffer slice - writable_buffer_space = writable_buffer_space[bytes_read ..].as_mut(); + writable_buffer_space = writable_buffer_space[bytes_read..].as_mut(); } // An error occurred while reading @@ -176,20 +167,17 @@ mod tests { let mut buffer = Vec::new(); bitarray.write_binary(&mut buffer).unwrap(); - assert_eq!( - buffer, - vec![ - 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, - 0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 - ] - ); + assert_eq!(buffer, vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ]); } #[test] fn test_read_binary() { let buffer = vec![ - 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, - 0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0, + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0, ]; let mut bitarray = BitArray::with_capacity(4, 40); diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 0a5f647..655d17e 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -4,10 +4,7 @@ mod binary; use std::{ cmp::max, - io::{ - Result, - Write - } + io::{Result, Write} }; /// Re-export the `Binary` trait. @@ -16,11 +13,11 @@ pub use binary::Binary; /// A fixed-size bit array implementation. pub struct BitArray { /// The underlying data storage for the bit array. - data: Vec, + data: Vec, /// The mask used to extract the relevant bits from each element in the data vector. - mask: u64, + mask: u64, /// The length of the bit array. - len: usize, + len: usize, /// The number of bits in a single element of the data vector. bits_per_value: usize } @@ -37,11 +34,7 @@ impl BitArray { /// /// A new `BitArray` with the specified capacity. 
pub fn with_capacity(capacity: usize, bits_per_value: usize) -> Self { - let extra = if capacity * bits_per_value % 64 == 0 { - 0 - } else { - 1 - }; + let extra = if capacity * bits_per_value % 64 == 0 { 0 } else { 1 }; Self { data: vec![0; capacity * bits_per_value / 64 + extra], mask: (1 << bits_per_value) - 1, @@ -67,8 +60,7 @@ impl BitArray { if start_block_offset + self.bits_per_value <= 64 { // Shift the value to the right so that the relevant bits are in the least significant // position Then mask out the irrelevant bits - return self.data[start_block] >> (64 - start_block_offset - self.bits_per_value) - & self.mask; + return self.data[start_block] >> (64 - start_block_offset - self.bits_per_value) & self.mask; } let end_block = (index + 1) * self.bits_per_value / 64; @@ -99,8 +91,7 @@ impl BitArray { // If the value is contained within a single block if start_block_offset + self.bits_per_value <= 64 { // Clear the relevant bits in the start block - self.data[start_block] &= - !(self.mask << (64 - start_block_offset - self.bits_per_value)); + self.data[start_block] &= !(self.mask << (64 - start_block_offset - self.bits_per_value)); // Set the relevant bits in the start block self.data[start_block] |= value << (64 - start_block_offset - self.bits_per_value); return; @@ -176,8 +167,7 @@ pub fn data_to_writer( // Update the max capacity to be a multiple of the greatest common divisor of the bits per value // and 64. This is done to ensure that the bit array can store the data entirely let greates_common_divisor = gcd(bits_per_value, 64); - let capacity = - max(greates_common_divisor, max_capacity / greates_common_divisor * greates_common_divisor); + let capacity = max(greates_common_divisor, max_capacity / greates_common_divisor * greates_common_divisor); // If amount of data is less than the max capacity, write the data to the writer in a single // chunk @@ -316,88 +306,69 @@ mod tests { data_to_writer(data, 40, 2, &mut writer).unwrap(); - assert_eq!( - writer, - vec![ - 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, - 0x23, 0x01, 0x00, 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 - ] - ); + assert_eq!(writer, vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ]); } #[test] fn test_data_to_writer_chunks_needed_no_remainder() { let data = vec![ - 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777, - 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, - 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, - 0x66666666, 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, - 0xdddddddd, 0xeeeeeeee, 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, - 0x44444444, 0x55555555, 0x66666666, 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, - 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, 0xffffffff, 0x00000000, 0x11111111, - 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777, 0x88888888, - 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, 0xffffffff, + 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777, 0x88888888, 0x99999999, + 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, 0xffffffff, 0x00000000, 0x11111111, 0x22222222, + 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, + 0xcccccccc, 
0xdddddddd, 0xeeeeeeee, 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, 0x44444444, + 0x55555555, 0x66666666, 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, + 0xeeeeeeee, 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, + 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, 0xffffffff, 0x00000000, ]; let mut writer = Vec::new(); data_to_writer(data, 32, 8, &mut writer).unwrap(); - assert_eq!( - writer, - vec![ - 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, - 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, - 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, - 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, - 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, - 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, - 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, - 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, - 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, - 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, - 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, - 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, - 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, - 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, - 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, - 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, - 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, - 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff - ] - ); + assert_eq!(writer, vec![ + 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, + 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, + 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, + 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, + 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, + 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, + 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, + 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, + 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, + 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, + 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, + 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, + 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 
0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, + 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff + ]); } #[test] fn test_data_to_writer_chunks_needed_plus_remainder() { let data = vec![ - 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777, - 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, - 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, - 0x66666666, 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, - 0xdddddddd, 0xeeeeeeee, 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, + 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777, 0x88888888, 0x99999999, + 0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, 0xeeeeeeee, 0xffffffff, 0x00000000, 0x11111111, 0x22222222, + 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777, 0x88888888, 0x99999999, 0xaaaaaaaa, 0xbbbbbbbb, + 0xcccccccc, 0xdddddddd, 0xeeeeeeee, 0xffffffff, 0x00000000, 0x11111111, 0x22222222, 0x33333333, ]; let mut writer = Vec::new(); data_to_writer(data, 32, 8, &mut writer).unwrap(); - assert_eq!( - writer, - vec![ - 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, - 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, - 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, - 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, - 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, - 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, - 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, - 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, - 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, - 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x00, - 0x33, 0x33, 0x33, 0x33 - ] - ); + assert_eq!(writer, vec![ + 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, + 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, + 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, + 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, + 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, + 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, + 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, + 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x00, 0x33, 0x33, 0x33, 0x33 + ]); } #[test] diff --git a/fa-compression/benches/algorithm1/decode.rs b/fa-compression/benches/algorithm1/decode.rs index 07bdc41..24991dc 100644 --- a/fa-compression/benches/algorithm1/decode.rs +++ b/fa-compression/benches/algorithm1/decode.rs @@ -1,8 +1,5 @@ use criterion::black_box; -use fa_compression::algorithm1::{ - decode, - encode -}; +use fa_compression::algorithm1::{decode, encode}; use super::util::generate_annotation; @@ -11,7 +8,7 @@ fn generate_encoded_annotations(count: usize) -> Vec { let 
mut random = rand::thread_rng(); let mut annotations = String::new(); - for _ in 0 .. count { + for _ in 0..count { annotations.push_str(&generate_annotation(&mut random)); annotations.push(';'); } diff --git a/fa-compression/benches/algorithm1/encode.rs b/fa-compression/benches/algorithm1/encode.rs index e134c6a..d3a9c86 100644 --- a/fa-compression/benches/algorithm1/encode.rs +++ b/fa-compression/benches/algorithm1/encode.rs @@ -8,7 +8,7 @@ fn generate_decoded_annotations(count: usize) -> String { let mut random = rand::thread_rng(); let mut annotations = String::new(); - for _ in 0 .. count { + for _ in 0..count { annotations.push_str(&generate_annotation(&mut random)); annotations.push(';'); } diff --git a/fa-compression/benches/algorithm2/decode.rs b/fa-compression/benches/algorithm2/decode.rs index a70d6b4..4d562fc 100644 --- a/fa-compression/benches/algorithm2/decode.rs +++ b/fa-compression/benches/algorithm2/decode.rs @@ -1,9 +1,5 @@ use criterion::black_box; -use fa_compression::algorithm2::{ - decode, - encode, - CompressionTable -}; +use fa_compression::algorithm2::{decode, encode, CompressionTable}; use super::util::generate_annotation; @@ -14,7 +10,7 @@ fn generate_encoded_annotations_and_table(count: usize) -> (Vec, Compression let mut compression_table2 = CompressionTable::new(); let mut annotations = String::new(); - for _ in 0 .. count { + for _ in 0..count { let annotation = generate_annotation(&mut random); annotations.push_str(&annotation); annotations.push(';'); diff --git a/fa-compression/benches/algorithm2/encode.rs b/fa-compression/benches/algorithm2/encode.rs index e1729f7..827dd50 100644 --- a/fa-compression/benches/algorithm2/encode.rs +++ b/fa-compression/benches/algorithm2/encode.rs @@ -1,8 +1,5 @@ use criterion::black_box; -use fa_compression::algorithm2::{ - encode, - CompressionTable -}; +use fa_compression::algorithm2::{encode, CompressionTable}; use super::util::generate_annotation; @@ -12,7 +9,7 @@ fn generate_decoded_annotations_and_table(count: usize) -> (String, CompressionT let mut compression_table = CompressionTable::new(); let mut annotations = String::new(); - for _ in 0 .. count { + for _ in 0..count { let annotation = generate_annotation(&mut random); annotations.push_str(&annotation); annotations.push(';'); diff --git a/fa-compression/benches/util.rs b/fa-compression/benches/util.rs index 47d9990..b6ddd9a 100644 --- a/fa-compression/benches/util.rs +++ b/fa-compression/benches/util.rs @@ -1,32 +1,29 @@ -use rand::{ - rngs::ThreadRng, - Rng -}; +use rand::{rngs::ThreadRng, Rng}; /// Generate a random InterPro annotation. pub fn generate_ipr(random: &mut ThreadRng) -> String { - format!("IPR:IPR{:06}", random.gen_range(0 .. 999999)) + format!("IPR:IPR{:06}", random.gen_range(0..999999)) } /// Generate a random Gene Ontology annotation. pub fn generate_go(random: &mut ThreadRng) -> String { - format!("GO:{:07}", random.gen_range(0 .. 9999999)) + format!("GO:{:07}", random.gen_range(0..9999999)) } /// Generate a random Enzyme Commission annotation. pub fn generate_ec(random: &mut ThreadRng) -> String { format!( "EC:{}.{}.{}.{}", - random.gen_range(0 .. 8), - random.gen_range(0 .. 30), - random.gen_range(0 .. 30), - random.gen_range(0 .. 200) + random.gen_range(0..8), + random.gen_range(0..30), + random.gen_range(0..30), + random.gen_range(0..200) ) } /// Generate a random annotation. pub fn generate_annotation(random: &mut ThreadRng) -> String { - match random.gen_range(0 .. 
3) { + match random.gen_range(0..3) { 0 => generate_ipr(random), 1 => generate_go(random), 2 => generate_ec(random), diff --git a/fa-compression/src/algorithm1/decode.rs b/fa-compression/src/algorithm1/decode.rs index 5295a58..01f8e86 100644 --- a/fa-compression/src/algorithm1/decode.rs +++ b/fa-compression/src/algorithm1/decode.rs @@ -1,10 +1,7 @@ //! This module provides a function to decode a byte array into a string representation of //! annotations. -use super::{ - CharacterSet, - Decode -}; +use super::{CharacterSet, Decode}; /// The prefixes for the different types of annotations. static PREFIXES: [&str; 3] = ["EC:", "GO:", "IPR:IPR"]; @@ -53,11 +50,7 @@ pub fn decode(input: &[u8]) -> String { // Given the additional prefixes, we can safely triple the space. This might // allocate more than necessary, but it's a simple and fast solution. let mut result = String::with_capacity(input.len() * 3); - for (annotations, prefix) in decoded - .split(',') - .zip(PREFIXES) - .filter(|(s, _)| !s.is_empty()) - { + for (annotations, prefix) in decoded.split(',').zip(PREFIXES).filter(|(s, _)| !s.is_empty()) { for annotation in annotations.split(';') { result.push_str(prefix); result.push_str(annotation); @@ -97,34 +90,23 @@ mod tests { #[test] fn test_decode_no_ec() { - assert_eq!( - decode(&[225, 17, 163, 138, 225, 39, 71, 95, 17, 153, 39]), - "GO:0009279;IPR:IPR016364;IPR:IPR008816" - ) + assert_eq!(decode(&[225, 17, 163, 138, 225, 39, 71, 95, 17, 153, 39]), "GO:0009279;IPR:IPR016364;IPR:IPR008816") } #[test] fn test_decode_no_go() { - assert_eq!( - decode(&[44, 44, 44, 191, 44, 60, 44, 142, 225, 39, 71, 80]), - "EC:1.1.1.-;EC:1.2.1.7;IPR:IPR016364" - ) + assert_eq!(decode(&[44, 44, 44, 191, 44, 60, 44, 142, 225, 39, 71, 80]), "EC:1.1.1.-;EC:1.2.1.7;IPR:IPR016364") } #[test] fn test_decode_no_ipr() { - assert_eq!( - decode(&[44, 44, 44, 190, 17, 26, 56, 175, 17, 26, 56, 174]), - "EC:1.1.1.-;GO:0009279;GO:0009279" - ) + assert_eq!(decode(&[44, 44, 44, 190, 17, 26, 56, 175, 17, 26, 56, 174]), "EC:1.1.1.-;GO:0009279;GO:0009279") } #[test] fn test_decode_all() { assert_eq!( - decode(&[ - 44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117, 241, 67, 116, 111, 17, 153, 39 - ]), + decode(&[44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117, 241, 67, 116, 111, 17, 153, 39]), "EC:1.1.1.-;GO:0009279;IPR:IPR016364;IPR:IPR032635;IPR:IPR008816" ) } diff --git a/fa-compression/src/algorithm1/encode.rs b/fa-compression/src/algorithm1/encode.rs index 0877c9a..ef79372 100644 --- a/fa-compression/src/algorithm1/encode.rs +++ b/fa-compression/src/algorithm1/encode.rs @@ -1,9 +1,6 @@ //! This module contains the function to encode the input string into a compressed byte vector. -use super::{ - CharacterSet, - Encode -}; +use super::{CharacterSet, Encode}; /// Encodes the input string into a compressed byte vector. 
/// @@ -50,11 +47,11 @@ pub fn encode(input: &str) -> Vec { // Read the input and split the annotations into the corresponding vectors for annotation in input.split(';') { if annotation.starts_with("IPR") { - interpros.push(&annotation[7 ..]); + interpros.push(&annotation[7..]); } else if annotation.starts_with("GO") { - gos.push(&annotation[3 ..]); + gos.push(&annotation[3..]); } else if annotation.starts_with("EC") { - ecs.push(&annotation[3 ..]); + ecs.push(&annotation[3..]); } } @@ -109,33 +106,27 @@ mod tests { #[test] fn test_encode_no_ec() { - assert_eq!( - encode("IPR:IPR016364;GO:0009279;IPR:IPR008816"), - vec![225, 17, 163, 138, 225, 39, 71, 95, 17, 153, 39] - ) + assert_eq!(encode("IPR:IPR016364;GO:0009279;IPR:IPR008816"), vec![ + 225, 17, 163, 138, 225, 39, 71, 95, 17, 153, 39 + ]) } #[test] fn test_encode_no_go() { - assert_eq!( - encode("IPR:IPR016364;EC:1.1.1.-;EC:1.2.1.7"), - vec![44, 44, 44, 191, 44, 60, 44, 142, 225, 39, 71, 80] - ) + assert_eq!(encode("IPR:IPR016364;EC:1.1.1.-;EC:1.2.1.7"), vec![ + 44, 44, 44, 191, 44, 60, 44, 142, 225, 39, 71, 80 + ]) } #[test] fn test_encode_no_ipr() { - assert_eq!( - encode("EC:1.1.1.-;GO:0009279;GO:0009279"), - vec![44, 44, 44, 190, 17, 26, 56, 175, 17, 26, 56, 174] - ) + assert_eq!(encode("EC:1.1.1.-;GO:0009279;GO:0009279"), vec![44, 44, 44, 190, 17, 26, 56, 175, 17, 26, 56, 174]) } #[test] fn test_encode_all() { - assert_eq!( - encode("IPR:IPR016364;EC:1.1.1.-;IPR:IPR032635;GO:0009279;IPR:IPR008816"), - vec![44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117, 241, 67, 116, 111, 17, 153, 39] - ) + assert_eq!(encode("IPR:IPR016364;EC:1.1.1.-;IPR:IPR032635;GO:0009279;IPR:IPR008816"), vec![ + 44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117, 241, 67, 116, 111, 17, 153, 39 + ]) } } diff --git a/fa-compression/src/algorithm1/mod.rs b/fa-compression/src/algorithm1/mod.rs index a495c9e..cdf7283 100644 --- a/fa-compression/src/algorithm1/mod.rs +++ b/fa-compression/src/algorithm1/mod.rs @@ -170,10 +170,8 @@ impl BitOr for CharacterSet { mod tests { use super::*; - static CHARACTERS: [u8; 16] = [ - b'$', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'-', b'.', b'n', b',', - b';' - ]; + static CHARACTERS: [u8; 16] = + [b'$', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'-', b'.', b'n', b',', b';']; static CHARACTER_SETS: [CharacterSet; 16] = [ CharacterSet::Empty, @@ -196,8 +194,8 @@ mod tests { #[test] fn test_or() { - for i in 0 .. CHARACTERS.len() { - for j in 0 .. CHARACTERS.len() { + for i in 0..CHARACTERS.len() { + for j in 0..CHARACTERS.len() { assert_eq!(CHARACTER_SETS[i] | CHARACTER_SETS[j], ((i as u8) << 4) | (j as u8)); } } @@ -205,7 +203,7 @@ mod tests { #[test] fn test_encode() { - for i in 0 .. CHARACTERS.len() { + for i in 0..CHARACTERS.len() { assert_eq!(CHARACTER_SETS[i], CharacterSet::encode(CHARACTERS[i])); } } diff --git a/fa-compression/src/algorithm2/encode.rs b/fa-compression/src/algorithm2/encode.rs index d52844e..f55eb11 100644 --- a/fa-compression/src/algorithm2/encode.rs +++ b/fa-compression/src/algorithm2/encode.rs @@ -34,7 +34,7 @@ pub fn encode(input: &str, compression_table: CompressionTable) -> Vec { let mut encoded: Vec = Vec::with_capacity(input.len() / 3); for annotation in input.split(';') { if let Some(index) = compression_table.index_of(annotation) { - encoded.extend_from_slice(&index.to_le_bytes()[0 .. 
3]) + encoded.extend_from_slice(&index.to_le_bytes()[0..3]) } } @@ -89,9 +89,8 @@ mod tests { #[test] fn test_encode_all() { let table = create_compresion_table(); - assert_eq!( - encode("IPR:IPR000001;EC:1.1.1.-;IPR:IPR000003;GO:0000002", table), - vec![0, 0, 0, 7, 0, 0, 2, 0, 0, 5, 0, 0] - ) + assert_eq!(encode("IPR:IPR000001;EC:1.1.1.-;IPR:IPR000003;GO:0000002", table), vec![ + 0, 0, 0, 7, 0, 0, 2, 0, 0, 5, 0, 0 + ]) } } diff --git a/fa-compression/src/algorithm2/mod.rs b/fa-compression/src/algorithm2/mod.rs index be08fe4..8fc505a 100644 --- a/fa-compression/src/algorithm2/mod.rs +++ b/fa-compression/src/algorithm2/mod.rs @@ -36,9 +36,7 @@ impl CompressionTable { /// let table = CompressionTable::new(); /// ``` pub fn new() -> CompressionTable { - CompressionTable { - entries: Vec::new() - } + CompressionTable { entries: Vec::new() } } /// Adds a new entry to the compression table. @@ -57,16 +55,12 @@ impl CompressionTable { /// table.add_entry("IPR:IPR000002".to_string()); /// ``` pub fn add_entry(&mut self, annotation: String) { - self.entries.push(CompressionTableEntry { - annotation - }); + self.entries.push(CompressionTableEntry { annotation }); } /// Returns the index of the given annotation in the compression table, if it exists. fn index_of(&self, annotation: &str) -> Option { - self.entries - .iter() - .position(|entry| entry.annotation == annotation) + self.entries.iter().position(|entry| entry.annotation == annotation) } } diff --git a/libsais64-rs/builder.rs b/libsais64-rs/builder.rs index 78851a1..5b3feb2 100644 --- a/libsais64-rs/builder.rs +++ b/libsais64-rs/builder.rs @@ -1,24 +1,15 @@ use std::{ env, error::Error, - fmt::{ - Display, - Formatter - }, - path::{ - Path, - PathBuf - }, - process::{ - Command, - ExitStatus - } + fmt::{Display, Formatter}, + path::{Path, PathBuf}, + process::{Command, ExitStatus} }; /// Custom error for compilation of the C library #[derive(Debug)] struct CompileError<'a> { - command: &'a str, + command: &'a str, exit_code: Option } @@ -52,19 +43,13 @@ impl<'a> Error for CompileError<'a> {} fn exit_status_to_result(name: &str, exit_status: ExitStatus) -> Result<(), CompileError> { match exit_status.success() { true => Ok(()), - false => Err(CompileError { - command: name, - exit_code: exit_status.code() - }) + false => Err(CompileError { command: name, exit_code: exit_status.code() }) } } fn main() -> Result<(), Box> { // remove the old libsais folder - Command::new("rm") - .args(["-rf", "libsais"]) - .status() - .unwrap_or_default(); // if removing fails, it is since the folder did not exist, we just can ignore it + Command::new("rm").args(["-rf", "libsais"]).status().unwrap_or_default(); // if removing fails, it is since the folder did not exist, we just can ignore it // clone the c library Command::new("git") @@ -73,15 +58,10 @@ fn main() -> Result<(), Box> { .expect("Failed to clone the libsais repository"); // compile the c library - Command::new("rm") - .args(["libsais/CMakeCache.txt"]) - .status() - .unwrap_or_default(); // if removing fails, it is since the cmake cache did not exist, we just can ignore it + Command::new("rm").args(["libsais/CMakeCache.txt"]).status().unwrap_or_default(); // if removing fails, it is since the cmake cache did not exist, we just can ignore it exit_status_to_result( "cmake", - Command::new("cmake") - .args(["-DCMAKE_BUILD_TYPE=\"Release\"", "libsais", "-Blibsais"]) - .status()? + Command::new("cmake").args(["-DCMAKE_BUILD_TYPE=\"Release\"", "libsais", "-Blibsais"]).status()? 
)?; exit_status_to_result("make", Command::new("make").args(["-C", "libsais"]).status()?)?; diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index e27cd93..e2a87f6 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -15,9 +15,7 @@ include!(concat!(env!("OUT_DIR"), "/bindings.rs")); /// Returns None if construction of the suffix array failed pub fn sais64(text: &[u8]) -> Option> { let mut sa = vec![0; text.len()]; - let exit_code = unsafe { - libsais64(text.as_ptr(), sa.as_mut_ptr(), text.len() as i64, 0, std::ptr::null_mut()) - }; + let exit_code = unsafe { libsais64(text.as_ptr(), sa.as_mut_ptr(), text.len() as i64, 0, std::ptr::null_mut()) }; if exit_code == 0 { Some(sa) } else { None } } diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index 98ac63d..c0e13cd 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -1,9 +1,6 @@ use std::error::Error; -use clap::{ - Parser, - ValueEnum -}; +use clap::{Parser, ValueEnum}; /// Build a (sparse, compressed) suffix array from the given text #[derive(Parser, Debug)] @@ -11,23 +8,20 @@ pub struct Arguments { /// File with the proteins used to build the suffix tree. All the proteins are expected to be /// concatenated using a hashtag `#`. #[arg(short, long)] - pub database_file: String, - /// The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy. - #[arg(short, long)] - pub taxonomy: String, + pub database_file: String, /// Output location where to store the suffix array #[arg(short, long)] - pub output: String, + pub output: String, /// The sparseness_factor used on the suffix array (default value 1, which means every value in /// the SA is used) #[arg(short, long, default_value_t = 1)] - pub sparseness_factor: u8, + pub sparseness_factor: u8, /// The algorithm used to construct the suffix array (default value LibSais) #[arg(short('a'), long, value_enum, default_value_t = SAConstructionAlgorithm::LibSais)] pub construction_algorithm: SAConstructionAlgorithm, /// If the suffix array should be compressed (default value true) #[arg(short, long, default_value_t = false)] - pub compress_sa: bool + pub compress_sa: bool } /// Enum representing the two possible algorithms to construct the suffix array @@ -103,7 +97,7 @@ fn sample_sa(sa: &mut Vec, sparseness_factor: u8) { } let mut current_sampled_index = 0; - for i in 0 .. 
sa.len() { + for i in 0..sa.len() { let current_sa_val = sa[i]; if current_sa_val % sparseness_factor as i64 == 0 { sa[current_sampled_index] = current_sa_val; @@ -125,8 +119,6 @@ mod tests { "sa-builder", "--database-file", "database.fa", - "--taxonomy", - "taxonomy.tsv", "--output", "output.fa", "--sparseness-factor", @@ -137,7 +129,6 @@ mod tests { ]); assert_eq!(args.database_file, "database.fa"); - assert_eq!(args.taxonomy, "taxonomy.tsv"); assert_eq!(args.output, "output.fa"); assert_eq!(args.sparseness_factor, 2); assert_eq!(args.construction_algorithm, SAConstructionAlgorithm::LibDivSufSort); @@ -150,10 +141,7 @@ mod tests { SAConstructionAlgorithm::from_str("lib-div-suf-sort", false), Ok(SAConstructionAlgorithm::LibDivSufSort) ); - assert_eq!( - SAConstructionAlgorithm::from_str("lib-sais", false), - Ok(SAConstructionAlgorithm::LibSais) - ); + assert_eq!(SAConstructionAlgorithm::from_str("lib-sais", false), Ok(SAConstructionAlgorithm::LibSais)); } #[test] diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index f458c47..98a1414 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -1,59 +1,28 @@ use std::{ - fs::{ - File, - OpenOptions - }, + fs::{File, OpenOptions}, io::BufWriter, - time::{ - SystemTime, - SystemTimeError, - UNIX_EPOCH - } + time::{SystemTime, SystemTimeError, UNIX_EPOCH} }; use clap::Parser; -use sa_builder::{ - build_ssa, - Arguments -}; +use sa_builder::{build_ssa, Arguments}; use sa_compression::dump_compressed_suffix_array; use sa_index::binary::dump_suffix_array; -use sa_mappings::{ - proteins::Proteins, - taxonomy::{ - AggregationMethod, - TaxonAggregator - } -}; +use sa_mappings::proteins::Proteins; fn main() { let Arguments { database_file, - taxonomy, output, sparseness_factor, construction_algorithm, compress_sa } = Arguments::parse(); - - eprintln!(); - eprintln!("📋 Started loading the taxon file..."); - let start_taxon_time = get_time_ms().unwrap(); - let taxon_id_calculator = - TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar) - .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); - eprintln!( - "✅ Successfully loaded the taxon file in {} seconds!", - (get_time_ms().unwrap() - start_taxon_time) / 1000.0 - ); - eprintln!("\tAggregation method: LCA*"); - eprintln!(); eprintln!("📋 Started loading the proteins..."); let start_proteins_time = get_time_ms().unwrap(); - let mut data = - Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator) - .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); + let mut data = Proteins::try_from_database_file_without_annotations(&database_file) + .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); eprintln!( "✅ Successfully loaded the proteins in {} seconds!", (get_time_ms().unwrap() - start_proteins_time) / 1000.0 @@ -72,8 +41,8 @@ fn main() { eprintln!("\tSample rate: {}", sparseness_factor); // open the output file - let mut file = open_file_buffer(&output, 100 * 1024 * 1024) - .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); + let mut file = + open_file_buffer(&output, 100 * 1024 * 1024).unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); eprintln!(); eprintln!("📋 Started dumping the suffix array..."); @@ -82,9 +51,7 @@ fn main() { if compress_sa { let bits_per_value = (data.len() as f64).log2().ceil() as usize; - if let Err(err) = - dump_compressed_suffix_array(sa, sparseness_factor, bits_per_value, &mut file) - { + if let Err(err) = 
dump_compressed_suffix_array(sa, sparseness_factor, bits_per_value, &mut file) { eprint_and_exit(err.to_string().as_str()); }; diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs index 85a41df..e9952a2 100644 --- a/sa-compression/src/lib.rs +++ b/sa-compression/src/lib.rs @@ -1,16 +1,9 @@ use std::{ error::Error, - io::{ - BufRead, - Write - } + io::{BufRead, Write} }; -use bitarray::{ - data_to_writer, - Binary, - BitArray -}; +use bitarray::{data_to_writer, Binary, BitArray}; use sa_index::SuffixArray; /// Writes the compressed suffix array to a writer. @@ -148,24 +141,19 @@ mod tests { let mut writer = vec![]; dump_compressed_suffix_array(sa, 1, 8, &mut writer).unwrap(); - assert_eq!( - writer, - vec![ - // bits per value - 8, // sparseness factor - 1, // size of the suffix array - 10, 0, 0, 0, 0, 0, 0, 0, // compressed suffix array - 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 10, 9 - ] - ); + assert_eq!(writer, vec![ + // bits per value + 8, // sparseness factor + 1, // size of the suffix array + 10, 0, 0, 0, 0, 0, 0, 0, // compressed suffix array + 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 10, 9 + ]); } #[test] #[should_panic(expected = "Could not write the required bits to the writer")] fn test_dump_compressed_suffix_array_fail_required_bits() { - let mut writer = FailingWriter { - valid_write_count: 0 - }; + let mut writer = FailingWriter { valid_write_count: 0 }; dump_compressed_suffix_array(vec![], 1, 8, &mut writer).unwrap(); } @@ -173,9 +161,7 @@ mod tests { #[test] #[should_panic(expected = "Could not write the sparseness factor to the writer")] fn test_dump_compressed_suffix_array_fail_sparseness_factor() { - let mut writer = FailingWriter { - valid_write_count: 1 - }; + let mut writer = FailingWriter { valid_write_count: 1 }; dump_compressed_suffix_array(vec![], 1, 8, &mut writer).unwrap(); } @@ -183,9 +169,7 @@ mod tests { #[test] #[should_panic(expected = "Could not write the size of the suffix array to the writer")] fn test_dump_compressed_suffix_array_fail_size() { - let mut writer = FailingWriter { - valid_write_count: 2 - }; + let mut writer = FailingWriter { valid_write_count: 2 }; dump_compressed_suffix_array(vec![], 1, 8, &mut writer).unwrap(); } @@ -193,9 +177,7 @@ mod tests { #[test] #[should_panic(expected = "Could not write the compressed suffix array to the writer")] fn test_dump_compressed_suffix_array_fail_compressed_suffix_array() { - let mut writer = FailingWriter { - valid_write_count: 3 - }; + let mut writer = FailingWriter { valid_write_count: 3 }; dump_compressed_suffix_array(vec![1], 1, 8, &mut writer).unwrap(); } @@ -213,7 +195,7 @@ mod tests { let compressed_suffix_array = load_compressed_suffix_array(&mut reader, 8).unwrap(); assert_eq!(compressed_suffix_array.sample_rate(), 1); - for i in 0 .. 
10 { + for i in 0..10 { assert_eq!(compressed_suffix_array.get(i), i as i64 + 1); } } @@ -221,9 +203,7 @@ mod tests { #[test] #[should_panic(expected = "Could not read the sample rate from the binary file")] fn test_load_compressed_suffix_array_fail_sample_rate() { - let mut reader = FailingReader { - valid_read_count: 0 - }; + let mut reader = FailingReader { valid_read_count: 0 }; load_compressed_suffix_array(&mut reader, 8).unwrap(); } @@ -231,9 +211,7 @@ mod tests { #[test] #[should_panic(expected = "Could not read the size of the suffix array from the binary file")] fn test_load_compressed_suffix_array_fail_size() { - let mut reader = FailingReader { - valid_read_count: 1 - }; + let mut reader = FailingReader { valid_read_count: 1 }; load_compressed_suffix_array(&mut reader, 8).unwrap(); } @@ -241,27 +219,21 @@ mod tests { #[test] #[should_panic(expected = "Could not read the compressed suffix array from the binary file")] fn test_load_compressed_suffix_array_fail_compressed_suffix_array() { - let mut reader = FailingReader { - valid_read_count: 2 - }; + let mut reader = FailingReader { valid_read_count: 2 }; load_compressed_suffix_array(&mut reader, 8).unwrap(); } #[test] fn test_failing_writer() { - let mut writer = FailingWriter { - valid_write_count: 0 - }; + let mut writer = FailingWriter { valid_write_count: 0 }; assert!(writer.flush().is_ok()); assert!(writer.write(&[0]).is_err()); } #[test] fn test_failing_reader() { - let mut reader = FailingReader { - valid_read_count: 0 - }; + let mut reader = FailingReader { valid_read_count: 0 }; let right_buffer: [u8; 0] = []; assert_eq!(reader.fill_buf().unwrap(), &right_buffer); assert_eq!(reader.consume(0), ()); diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml index 557549c..de57fc9 100644 --- a/sa-index/Cargo.toml +++ b/sa-index/Cargo.toml @@ -11,7 +11,6 @@ fa-compression = { path = "../fa-compression" } [dependencies] clap = { version = "4.4.8", features = ["derive"] } -umgap = "1.1.0" rayon = "1.8.1" serde = { version = "1.0.197", features = ["derive"] } sa-mappings = { path = "../sa-mappings" } diff --git a/sa-index/src/binary.rs b/sa-index/src/binary.rs index 5688d4a..55c082a 100644 --- a/sa-index/src/binary.rs +++ b/sa-index/src/binary.rs @@ -1,10 +1,6 @@ use std::{ error::Error, - io::{ - BufRead, - Read, - Write - } + io::{BufRead, Read, Write} }; use crate::SuffixArray; @@ -69,7 +65,7 @@ impl Binary for Vec { loop { let (finished, bytes_read) = fill_buffer(&mut reader, &mut buffer)?; - for buffer_slice in buffer[.. bytes_read].chunks_exact(8) { + for buffer_slice in buffer[..bytes_read].chunks_exact(8) { self.push(i64::from_le_bytes(buffer_slice.try_into().unwrap())); } @@ -93,16 +89,10 @@ impl Binary for Vec { /// # Returns /// /// Returns `Ok(())` if the write operation is successful, or an `Err` if an error occurs. 
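The read path above pulls the file through a fixed-size buffer and decodes every 8-byte little-endian chunk into one i64 suffix-array entry. A standalone sketch of that decode step (a hypothetical helper, not part of the crate):

// Decodes consecutive 8-byte little-endian chunks into i64 entries, mirroring the
// chunks_exact(8) loop above; a trailing partial chunk is simply left unread.
fn decode_entries(bytes: &[u8]) -> Vec<i64> {
    bytes
        .chunks_exact(8)
        .map(|chunk| i64::from_le_bytes(chunk.try_into().unwrap()))
        .collect()
}

// decode_entries(&[1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0]) == vec![1, 2]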
-pub fn dump_suffix_array( - sa: &Vec, - sparseness_factor: u8, - writer: &mut impl Write -) -> Result<(), Box> { +pub fn dump_suffix_array(sa: &Vec, sparseness_factor: u8, writer: &mut impl Write) -> Result<(), Box> { // Write the required bits to the writer // 01000000 indicates that the suffix array is not compressed - writer - .write(&[64_u8]) - .map_err(|_| "Could not write the required bits to the writer")?; + writer.write(&[64_u8]).map_err(|_| "Could not write the required bits to the writer")?; // Write the sparseness factor to the writer writer @@ -116,8 +106,7 @@ pub fn dump_suffix_array( .map_err(|_| "Could not write the size of the suffix array to the writer")?; // Write the suffix array to the writer - sa.write_binary(writer) - .map_err(|_| "Could not write the suffix array to the writer")?; + sa.write_binary(writer).map_err(|_| "Could not write the suffix array to the writer")?; Ok(()) } @@ -150,8 +139,7 @@ pub fn load_suffix_array(reader: &mut impl BufRead) -> Result(input: &mut T, buffer: &mut Vec) -> std::io::Result< // No bytes written, which means we've completely filled the buffer // or we've reached the end of the file Ok(0) => { - return Ok(( - !writable_buffer_space.is_empty(), - buffer_size - writable_buffer_space.len() - )); + return Ok((!writable_buffer_space.is_empty(), buffer_size - writable_buffer_space.len())); } // We've read {bytes_read} bytes Ok(bytes_read) => { // Shrink the writable buffer slice - writable_buffer_space = writable_buffer_space[bytes_read ..].as_mut(); + writable_buffer_space = writable_buffer_space[bytes_read..].as_mut(); } // An error occurred while reading @@ -268,9 +253,7 @@ mod tests { #[test] fn test_fill_buffer_read_error() { - let mut input = FailingReader { - valid_read_count: 0 - }; + let mut input = FailingReader { valid_read_count: 0 }; let mut buffer = vec![0; 800]; assert!(fill_buffer(&mut input, &mut buffer).is_err()); @@ -283,20 +266,17 @@ mod tests { values.write_binary(&mut buffer).unwrap(); - assert_eq!( - buffer, - vec![ - 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, - 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0 - ] - ); + assert_eq!(buffer, vec![ + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, + 0, 0, 0, 0 + ]); } #[test] fn test_read_binary() { let buffer = vec![ - 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, - 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, + 0, 0, 0, 0, ]; let mut values = Vec::new(); @@ -312,25 +292,20 @@ mod tests { dump_suffix_array(&sa, 1, &mut buffer).unwrap(); - assert_eq!( - buffer, - vec![ - // required bits - 64, // Sparseness factor - 1, // Size of the suffix array - 5, 0, 0, 0, 0, 0, 0, 0, // Suffix array - 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, - 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0 - ] - ); + assert_eq!(buffer, vec![ + // required bits + 64, // Sparseness factor + 1, // Size of the suffix array + 5, 0, 0, 0, 0, 0, 0, 0, // Suffix array + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, + 0, 0, 0, 0 + ]); } #[test] #[should_panic(expected = "Could not write the required bits to the writer")] fn test_dump_suffix_array_fail_required_bits() { - let mut writer = FailingWriter { - valid_write_count: 0 - }; + let mut writer = FailingWriter { valid_write_count: 0 }; 
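From the dump test above, the uncompressed on-disk layout is: one marker byte (64, signalling an uncompressed array), one sparseness-factor byte, the number of entries as a little-endian u64, and then the entries themselves as little-endian i64 values. The load test starts at the sparseness factor, so the marker byte is presumably consumed by whichever caller decides between the compressed and uncompressed loaders. A small sketch that reads just this header (an assumed helper, not the crate's own loader):

use std::io::Read;

// Reads the header of an uncompressed suffix-array dump as laid out in the tests
// above; the entries follow as little-endian i64 values.
fn read_uncompressed_header(reader: &mut impl Read) -> std::io::Result<(u8, u8, u64)> {
    let mut marker = [0u8; 1];
    reader.read_exact(&mut marker)?;
    let mut sparseness_factor = [0u8; 1];
    reader.read_exact(&mut sparseness_factor)?;
    let mut size = [0u8; 8];
    reader.read_exact(&mut size)?;
    Ok((marker[0], sparseness_factor[0], u64::from_le_bytes(size)))
}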
dump_suffix_array(&vec![], 1, &mut writer).unwrap(); } @@ -338,9 +313,7 @@ mod tests { #[test] #[should_panic(expected = "Could not write the sparseness factor to the writer")] fn test_dump_suffix_array_fail_sparseness_factor() { - let mut writer = FailingWriter { - valid_write_count: 1 - }; + let mut writer = FailingWriter { valid_write_count: 1 }; dump_suffix_array(&vec![], 1, &mut writer).unwrap(); } @@ -348,9 +321,7 @@ mod tests { #[test] #[should_panic(expected = "Could not write the size of the suffix array to the writer")] fn test_dump_suffix_array_fail_size() { - let mut writer = FailingWriter { - valid_write_count: 2 - }; + let mut writer = FailingWriter { valid_write_count: 2 }; dump_suffix_array(&vec![], 1, &mut writer).unwrap(); } @@ -358,9 +329,7 @@ mod tests { #[test] #[should_panic(expected = "Could not write the suffix array to the writer")] fn test_dump_suffix_array_fail_suffix_array() { - let mut writer = FailingWriter { - valid_write_count: 3 - }; + let mut writer = FailingWriter { valid_write_count: 3 }; dump_suffix_array(&vec![1], 1, &mut writer).unwrap(); } @@ -371,15 +340,15 @@ mod tests { // Sample rate 1, // Size of the suffix array 5, 0, 0, 0, 0, 0, 0, 0, // Suffix array - 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, - 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, + 0, 0, 0, 0, ]; let mut reader = buffer.as_slice(); let sa = load_suffix_array(&mut reader).unwrap(); assert_eq!(sa.sample_rate(), 1); - for i in 0 .. 5 { + for i in 0..5 { assert_eq!(sa.get(i), i as i64 + 1); } } @@ -387,9 +356,7 @@ mod tests { #[test] #[should_panic(expected = "Could not read the sample rate from the binary file")] fn test_load_suffix_array_fail_sample_rate() { - let mut reader = FailingReader { - valid_read_count: 0 - }; + let mut reader = FailingReader { valid_read_count: 0 }; load_suffix_array(&mut reader).unwrap(); } @@ -397,9 +364,7 @@ mod tests { #[test] #[should_panic(expected = "Could not read the size of the suffix array from the binary file")] fn test_load_suffix_array_fail_size() { - let mut reader = FailingReader { - valid_read_count: 1 - }; + let mut reader = FailingReader { valid_read_count: 1 }; load_suffix_array(&mut reader).unwrap(); } @@ -407,9 +372,7 @@ mod tests { #[test] #[should_panic(expected = "Could not read the suffix array from the binary file")] fn test_load_suffix_array_fail_suffix_array() { - let mut reader = FailingReader { - valid_read_count: 2 - }; + let mut reader = FailingReader { valid_read_count: 2 }; load_suffix_array(&mut reader).unwrap(); } diff --git a/sa-index/src/peptide_search.rs b/sa-index/src/peptide_search.rs index a8d2d67..55d629f 100644 --- a/sa-index/src/peptide_search.rs +++ b/sa-index/src/peptide_search.rs @@ -1,46 +1,32 @@ use rayon::prelude::*; -use sa_mappings::{ - functionality::FunctionalAggregation, - proteins::Protein -}; +use sa_mappings::proteins::Protein; use serde::Serialize; -use crate::sa_searcher::{ - SearchAllSuffixesResult, - Searcher -}; +use crate::sa_searcher::{SearchAllSuffixesResult, Searcher}; -/// Struct representing a collection of `SearchResultWithAnalysis` or `SearchOnlyResult` results #[derive(Debug, Serialize)] -pub struct OutputData { - pub result: Vec -} - -/// Struct representing the search result of the `sequence` in the index, including the analyses -#[derive(Debug, Serialize)] -pub struct SearchResultWithAnalysis { +pub struct SearchResult { pub sequence: String, 
- pub lca: Option, - pub taxa: Vec, - pub uniprot_accession_numbers: Vec, - pub fa: Option, - pub cutoff_used: bool -} - -/// Struct representing the search result of the `sequence` in the index (without the analyses) -#[derive(Debug, Serialize)] -pub struct SearchOnlyResult { - pub sequence: String, - pub proteins: Vec, + pub proteins: Vec, pub cutoff_used: bool } /// Struct that represents all information known about a certain protein in our database #[derive(Debug, Serialize)] pub struct ProteinInfo { - pub taxon: usize, - pub uniprot_accession: String, - pub functional_annotations: Vec + pub taxon: u32, + pub uniprot_accession: String, + pub functional_annotations: String +} + +impl From<&Protein> for ProteinInfo { + fn from(protein: &Protein) -> Self { + ProteinInfo { + taxon: protein.taxon_id, + uniprot_accession: protein.uniprot_id.clone(), + functional_annotations: protein.get_functional_annotations() + } + } } /// Searches the `peptide` in the index multithreaded and retrieves the matching proteins @@ -49,7 +35,7 @@ pub struct ProteinInfo { /// * `searcher` - The Searcher which contains the protein database /// * `peptide` - The peptide that is being searched in the index /// * `cutoff` - The maximum amount of matches we want to process from the index -/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search +/// * `equate_il` - Boolean indicating if we want to equate I and L during search /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the /// taxonomy /// @@ -64,179 +50,37 @@ pub fn search_proteins_for_peptide<'a>( searcher: &'a Searcher, peptide: &str, cutoff: usize, - equalize_i_and_l: bool, - clean_taxa: bool + equate_il: bool ) -> Option<(bool, Vec<&'a Protein>)> { - let peptide = peptide.strip_suffix('\n').unwrap_or(peptide).to_uppercase(); + let peptide = peptide.trim_end().to_uppercase(); // words that are shorter than the sample rate are not searchable if peptide.len() < searcher.sa.sample_rate() as usize { return None; } - let suffix_search = - searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equalize_i_and_l); - let mut cutoff_used = false; - let suffixes = match suffix_search { - SearchAllSuffixesResult::MaxMatches(matched_suffixes) => { - cutoff_used = true; - matched_suffixes - } - SearchAllSuffixesResult::SearchResult(matched_suffixes) => matched_suffixes, - SearchAllSuffixesResult::NoMatches => { - eprintln!("No matches found for peptide: {}", peptide); - return None; - } - }; + let suffix_search = searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equate_il); + let (suffixes, cutoff_used) = match suffix_search { + SearchAllSuffixesResult::MaxMatches(matched_suffixes) => Some((matched_suffixes, true)), + SearchAllSuffixesResult::SearchResult(matched_suffixes) => Some((matched_suffixes, false)), + SearchAllSuffixesResult::NoMatches => None + }?; - let mut proteins = searcher.retrieve_proteins(&suffixes); - if clean_taxa { - proteins.retain(|protein| searcher.taxon_valid(protein)) - } + let proteins = searcher.retrieve_proteins(&suffixes); Some((cutoff_used, proteins)) } -/// Searches the `peptide` in the index multithreaded and retrieves the protein information from the -/// database This does NOT perform any of the analyses, it only retrieves the functional and -/// taxonomic annotations -/// -/// # Arguments -/// * `searcher` - The Searcher which contains the protein database -/// * `peptide` - The peptide that is being searched in the index -/// * `cutoff` - The 
maximum amount of matches we want to process from the index -/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search -/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the -/// taxonomy -/// -/// # Returns -/// -/// Returns Some(SearchOnlyResult) if the peptide has matches -/// Returns None if the peptides does not have any matches, or if the peptide is shorter than the -/// sparseness factor k used in the index -pub fn search_peptide_retrieve_annotations( - searcher: &Searcher, - peptide: &str, - cutoff: usize, - equalize_i_and_l: bool, - clean_taxa: bool -) -> Option { - let (cutoff_used, proteins) = - search_proteins_for_peptide(searcher, peptide, cutoff, equalize_i_and_l, clean_taxa)?; - - let annotations = searcher.get_all_functional_annotations(&proteins); - - let mut protein_info: Vec = vec![]; - for (&protein, annotations) in proteins.iter().zip(annotations) { - protein_info.push(ProteinInfo { - taxon: protein.taxon_id, - uniprot_accession: protein.uniprot_id.clone(), - functional_annotations: annotations - }) - } +pub fn search_peptide(searcher: &Searcher, peptide: &str, cutoff: usize, equate_il: bool) -> Option { + let (cutoff_used, proteins) = search_proteins_for_peptide(searcher, peptide, cutoff, equate_il)?; - Some(SearchOnlyResult { + Some(SearchResult { sequence: peptide.to_string(), - proteins: protein_info, + proteins: proteins.iter().map(|&protein| protein.into()).collect(), cutoff_used }) } -/// Searches the `peptide` in the index multithreaded and performs the taxonomic and functional -/// analyses -/// -/// # Arguments -/// * `searcher` - The Searcher which contains the protein database -/// * `peptide` - The peptide that is being searched in the index -/// * `cutoff` - The maximum amount of matches we want to process from the index -/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search -/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the -/// taxonomy -/// -/// # Returns -/// -/// Returns Some(SearchResultWithAnalysis) if the peptide has matches -/// Returns None if the peptides does not have any matches, or if the peptide is shorter than the -/// sparseness factor k used in the index -pub fn analyse_peptide( - searcher: &Searcher, - peptide: &str, - cutoff: usize, - equalize_i_and_l: bool, - clean_taxa: bool -) -> Option { - let (cutoff_used, mut proteins) = - search_proteins_for_peptide(searcher, peptide, cutoff, equalize_i_and_l, clean_taxa)?; - - if clean_taxa { - proteins.retain(|protein| searcher.taxon_valid(protein)) - } - - // calculate the lca - let lca = if cutoff_used { - Some(1) - } else { - searcher.retrieve_lca(&proteins) - }; - - // return None if the LCA is none - lca?; - - let mut uniprot_accession_numbers = vec![]; - let mut taxa = vec![]; - - for protein in &proteins { - taxa.push(protein.taxon_id); - uniprot_accession_numbers.push(protein.uniprot_id.clone()); - } - - let fa = searcher.retrieve_function(&proteins); - // output the result - Some(SearchResultWithAnalysis { - sequence: peptide.to_string(), - lca, - cutoff_used, - uniprot_accession_numbers, - taxa, - fa - }) -} - -/// Searches the list of `peptides` in the index multithreaded and performs the functional and -/// taxonomic analyses -/// -/// # Arguments -/// * `searcher` - The Searcher which contains the protein database -/// * `peptides` - List of peptides we want to search in the index -/// * `cutoff` - The maximum amount of 
matches we want to process from the index -/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search -/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the -/// taxonomy -/// -/// # Returns -/// -/// Returns an `OutputData` object with the search and analyses results -/// for the peptides -pub fn analyse_all_peptides( - searcher: &Searcher, - peptides: &Vec, - cutoff: usize, - equalize_i_and_l: bool, - clean_taxa: bool -) -> OutputData { - let res: Vec = peptides - .par_iter() - // calculate the results - .map(|peptide| analyse_peptide(searcher, peptide, cutoff, equalize_i_and_l, clean_taxa)) - // remove the None's - .filter_map(|search_result| search_result) - .collect(); - - OutputData { - result: res - } -} - /// Searches the list of `peptides` in the index and retrieves all related information about the /// found proteins This does NOT perform any of the analyses /// @@ -244,7 +88,7 @@ pub fn analyse_all_peptides( /// * `searcher` - The Searcher which contains the protein database /// * `peptides` - List of peptides we want to search in the index /// * `cutoff` - The maximum amount of matches we want to process from the index -/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search +/// * `equate_il` - Boolean indicating if we want to equate I and L during search /// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the /// taxonomy /// @@ -255,28 +99,12 @@ pub fn search_all_peptides( searcher: &Searcher, peptides: &Vec, cutoff: usize, - equalize_i_and_l: bool, - clean_taxa: bool -) -> OutputData { - let res: Vec = peptides + equate_il: bool +) -> Vec { + peptides .par_iter() - // calculate the results - .map(|peptide| { - search_peptide_retrieve_annotations( - searcher, - peptide, - cutoff, - equalize_i_and_l, - clean_taxa - ) - }) - // remove None's - .filter_map(|search_result| search_result) - .collect(); - - OutputData { - result: res - } + .filter_map(|peptide| search_peptide(searcher, peptide, cutoff, equate_il)) + .collect() } #[cfg(test)] @@ -290,54 +118,26 @@ mod tests { ); } - #[test] - fn test_serialize_output_data() { - let output_data = OutputData { - result: vec![1, 2, 3] - }; - - let generated_json = serde_json::to_string(&output_data).unwrap(); - let expected_json = "{\"result\":[1,2,3]}"; - - assert_json_eq(&generated_json, expected_json); - } - - #[test] - fn test_serialize_search_result_with_analysis() { - let search_result = SearchResultWithAnalysis { - sequence: "MSKIAALLPSV".to_string(), - lca: Some(1), - taxa: vec![1, 2, 3], - uniprot_accession_numbers: vec!["P12345".to_string(), "P23456".to_string()], - fa: None, - cutoff_used: true - }; - - let generated_json = serde_json::to_string(&search_result).unwrap(); - let expected_json = "{\"sequence\":\"MSKIAALLPSV\",\"lca\":1,\"taxa\":[1,2,3],\"uniprot_accession_numbers\":[\"P12345\",\"P23456\"],\"fa\":null,\"cutoff_used\":true}"; - - assert_json_eq(&generated_json, expected_json); - } - #[test] fn test_serialize_protein_info() { let protein_info = ProteinInfo { - taxon: 1, - uniprot_accession: "P12345".to_string(), - functional_annotations: vec!["GO:0001234".to_string(), "GO:0005678".to_string()] + taxon: 1, + uniprot_accession: "P12345".to_string(), + functional_annotations: "GO:0001234;GO:0005678".to_string() }; let generated_json = serde_json::to_string(&protein_info).unwrap(); - let expected_json = 
"{\"taxon\":1,\"uniprot_accession\":\"P12345\",\"functional_annotations\":[\"GO:0001234\",\"GO:0005678\"]}"; + let expected_json = + "{\"taxon\":1,\"uniprot_accession\":\"P12345\",\"functional_annotations\":\"GO:0001234;GO:0005678\"}"; assert_json_eq(&generated_json, expected_json); } #[test] - fn test_serialize_search_only_result() { - let search_result = SearchOnlyResult { - sequence: "MSKIAALLPSV".to_string(), - proteins: vec![], + fn test_serialize_search_result() { + let search_result = SearchResult { + sequence: "MSKIAALLPSV".to_string(), + proteins: vec![], cutoff_used: true }; diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index 29bbc9a..d09c704 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -1,26 +1,11 @@ -use std::cmp::min; - -use sa_mappings::{ - functionality::{ - FunctionAggregator, - FunctionalAggregation - }, - proteins::{ - Protein, - Proteins - }, - taxonomy::TaxonAggregator -}; -use umgap::taxon::TaxonId; +use std::{cmp::min, ops::Deref}; + +use sa_mappings::proteins::{Protein, Proteins}; use crate::{ - sa_searcher::BoundSearch::{ - Maximum, - Minimum - }, - suffix_to_protein_index::SuffixToProteinIndex, - Nullable, - SuffixArray + sa_searcher::BoundSearch::{Maximum, Minimum}, + suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex}, + Nullable, SuffixArray }; /// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array @@ -74,20 +59,54 @@ impl PartialEq for SearchAllSuffixesResult { } match (self, other) { - ( - SearchAllSuffixesResult::MaxMatches(arr1), - SearchAllSuffixesResult::MaxMatches(arr2) - ) => array_eq_unordered(arr1, arr2), - ( - SearchAllSuffixesResult::SearchResult(arr1), - SearchAllSuffixesResult::SearchResult(arr2) - ) => array_eq_unordered(arr1, arr2), + (SearchAllSuffixesResult::MaxMatches(arr1), SearchAllSuffixesResult::MaxMatches(arr2)) => { + array_eq_unordered(arr1, arr2) + } + (SearchAllSuffixesResult::SearchResult(arr1), SearchAllSuffixesResult::SearchResult(arr2)) => { + array_eq_unordered(arr1, arr2) + } (SearchAllSuffixesResult::NoMatches, SearchAllSuffixesResult::NoMatches) => true, _ => false } } } +pub struct SparseSearcher(Searcher); + +impl SparseSearcher { + pub fn new(sa: SuffixArray, proteins: Proteins) -> Self { + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); + Self(searcher) + } +} + +impl Deref for SparseSearcher { + type Target = Searcher; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +pub struct DenseSearcher(Searcher); + +impl DenseSearcher { + pub fn new(sa: SuffixArray, proteins: Proteins) -> Self { + let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.input_string); + let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); + Self(searcher) + } +} + +impl Deref for DenseSearcher { + type Target = Searcher; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// Struct that contains all the elements needed to search a peptide in the suffix array /// This struct also contains all the functions used for search /// @@ -102,10 +121,8 @@ impl PartialEq for SearchAllSuffixesResult { /// the functional analysis provided by Unipept pub struct Searcher { pub sa: SuffixArray, - pub suffix_index_to_protein: Box, pub proteins: Proteins, - pub taxon_id_calculator: TaxonAggregator, - pub function_aggregator: FunctionAggregator + pub 
suffix_index_to_protein: Box } impl Searcher { @@ -125,20 +142,8 @@ impl Searcher { /// # Returns /// /// Returns a new Searcher object - pub fn new( - sa: SuffixArray, - suffix_index_to_protein: Box, - proteins: Proteins, - taxon_id_calculator: TaxonAggregator, - function_aggregator: FunctionAggregator - ) -> Self { - Self { - sa, - suffix_index_to_protein, - proteins, - taxon_id_calculator, - function_aggregator - } + pub fn new(sa: SuffixArray, proteins: Proteins, suffix_index_to_protein: Box) -> Self { + Self { sa, proteins, suffix_index_to_protein } } /// Compares the `search_string` to the `suffix` @@ -158,13 +163,7 @@ impl Searcher { /// The first argument is true if `bound` == `Minimum` and `search_string` <= `suffix` or if /// `bound` == `Maximum` and `search_string` >= `suffix` The second argument indicates how /// far the `suffix` and `search_string` matched - fn compare( - &self, - search_string: &[u8], - suffix: i64, - skip: usize, - bound: BoundSearch - ) -> (bool, usize) { + fn compare(&self, search_string: &[u8], suffix: i64, skip: usize, bound: BoundSearch) -> (bool, usize) { let mut index_in_suffix = (suffix as usize) + skip; let mut index_in_search_string = skip; let mut is_cond_or_equal = false; @@ -178,8 +177,7 @@ impl Searcher { // match as long as possible while index_in_search_string < search_string.len() && index_in_suffix < self.proteins.input_string.len() - && (search_string[index_in_search_string] - == self.proteins.input_string[index_in_suffix] + && (search_string[index_in_search_string] == self.proteins.input_string[index_in_suffix] || (search_string[index_in_search_string] == b'L' && self.proteins.input_string[index_in_suffix] == b'I') || (search_string[index_in_search_string] == b'I' @@ -238,8 +236,7 @@ impl Searcher { while right - left > 1 { let center = (left + right) / 2; let skip = min(lcp_left, lcp_right); - let (retval, lcp_center) = - self.compare(search_string, self.sa.get(center), skip, bound); + let (retval, lcp_center) = self.compare(search_string, self.sa.get(center), skip, bound); found |= lcp_center == search_string.len(); @@ -256,8 +253,7 @@ impl Searcher { // handle edge case to search at index 0 if right == 1 && left == 0 { - let (retval, lcp_center) = - self.compare(search_string, self.sa.get(0), min(lcp_left, lcp_right), bound); + let (retval, lcp_center) = self.compare(search_string, self.sa.get(0), min(lcp_left, lcp_right), bound); found |= lcp_center == search_string.len(); @@ -300,7 +296,7 @@ impl Searcher { /// * `search_string` - The string/peptide we are searching in the suffix array /// * `max_matches` - The maximum amount of matches processed, if more matches are found we /// don't process them - /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false + /// * `equate_il` - True if we want to equate I and L during search, otherwise false /// /// # Returns /// @@ -310,7 +306,7 @@ impl Searcher { &self, search_string: &[u8], max_matches: usize, - equalize_i_and_l: bool + equate_il: bool ) -> SearchAllSuffixesResult { let mut matching_suffixes: Vec = vec![]; let mut il_locations = vec![]; @@ -323,14 +319,13 @@ impl Searcher { let mut skip: usize = 0; while skip < self.sa.sample_rate() as usize { let mut il_locations_start = 0; - while il_locations_start < il_locations.len() && il_locations[il_locations_start] < skip - { + while il_locations_start < il_locations.len() && il_locations[il_locations_start] < skip { il_locations_start += 1; } - let il_locations_current_suffix = 
&il_locations[il_locations_start ..]; - let current_search_string_prefix = &search_string[.. skip]; - let current_search_string_suffix = &search_string[skip ..]; - let search_bound_result = self.search_bounds(&search_string[skip ..]); + let il_locations_current_suffix = &il_locations[il_locations_start..]; + let current_search_string_prefix = &search_string[..skip]; + let current_search_string_suffix = &search_string[skip..]; + let search_bound_result = self.search_bounds(&search_string[skip..]); // if the shorter part is matched, see if what goes before the matched suffix matches // the unmatched part of the prefix if let BoundSearchResult::SearchResult((min_bound, max_bound)) = search_bound_result { @@ -347,16 +342,15 @@ impl Searcher { && ((skip == 0 || Self::check_prefix( current_search_string_prefix, - &self.proteins.input_string[suffix - skip .. suffix], - equalize_i_and_l + &self.proteins.input_string[suffix - skip..suffix], + equate_il )) && Self::check_suffix( skip, il_locations_current_suffix, current_search_string_suffix, - &self.proteins.input_string - [suffix .. suffix + search_string.len() - skip], - equalize_i_and_l + &self.proteins.input_string[suffix..suffix + search_string.len() - skip], + equate_il )) { matching_suffixes.push((suffix - skip) as i64); @@ -380,39 +374,33 @@ impl Searcher { } /// Returns true of the prefixes are the same - /// if `equalize_i_and_l` is set to true, L and I are considered the same + /// if `equate_il` is set to true, L and I are considered the same /// /// # Arguments /// * `search_string_prefix` - The unchecked prefix of the string/peptide that is searched /// * `index_prefix` - The unchecked prefix from the protein from the suffix array - /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false + /// * `equate_il` - True if we want to equate I and L during search, otherwise false /// /// # Returns /// /// Returns true if `search_string_prefix` and `index_prefix` are considered the same, otherwise /// false #[inline] - fn check_prefix( - search_string_prefix: &[u8], - index_prefix: &[u8], - equalize_i_and_l: bool - ) -> bool { - if equalize_i_and_l { - search_string_prefix.iter().zip(index_prefix).all( - |(&search_character, &index_character)| { - search_character == index_character - || (search_character == b'I' && index_character == b'L') - || (search_character == b'L' && index_character == b'I') - } - ) + fn check_prefix(search_string_prefix: &[u8], index_prefix: &[u8], equate_il: bool) -> bool { + if equate_il { + search_string_prefix.iter().zip(index_prefix).all(|(&search_character, &index_character)| { + search_character == index_character + || (search_character == b'I' && index_character == b'L') + || (search_character == b'L' && index_character == b'I') + }) } else { search_string_prefix == index_prefix } } /// Returns true of the search_string and index_string are equal - /// This is automatically true if `equalize_i_and_l` is set to true, since there matched during - /// search where I = L If `equalize_i_and_l` is set to false, we need to check if the I and + /// This is automatically true if `equate_il` is set to true, since there matched during + /// search where I = L If `equate_il` is set to false, we need to check if the I and /// L locations have the same character /// /// # Arguments @@ -422,7 +410,7 @@ impl Searcher { /// removed from it /// * `index_string` - The suffix that search_string matches with when I and L were equalized /// during search - /// * `equalize_i_and_l` - True if we 
want to equate I and L during search, otherwise false + /// * `equate_il` - True if we want to equate I and L during search, otherwise false /// /// # Returns /// @@ -432,9 +420,9 @@ impl Searcher { il_locations: &[usize], search_string: &[u8], index_string: &[u8], - equalize_i_and_l: bool + equate_il: bool ) -> bool { - if equalize_i_and_l { + if equate_il { true } else { for &il_location in il_locations { @@ -466,113 +454,14 @@ impl Searcher { } res } - - /// Searches all the matching proteins for a search_string/peptide in the suffix array - /// - /// # Arguments - /// * `search_string` - The string/peptide being searched - /// * `equalize_i_and_l` - If set to true, I and L are equalized during search - /// - /// # Returns - /// - /// Returns the matching proteins for the search_string - pub fn search_proteins_for_peptide( - &self, - search_string: &[u8], - equalize_i_and_l: bool - ) -> Vec<&Protein> { - let mut matching_suffixes = vec![]; - if let SearchAllSuffixesResult::SearchResult(suffixes) = - self.search_matching_suffixes(search_string, usize::MAX, equalize_i_and_l) - { - matching_suffixes = suffixes; - } - self.retrieve_proteins(&matching_suffixes) - } - - /// Retrieves the taxonomic analysis for a collection of proteins - /// - /// # Arguments - /// * `proteins` - A collection of proteins - /// - /// # Returns - /// - /// Returns the taxonomic analysis result for the given list of proteins - #[inline] - pub fn retrieve_lca(&self, proteins: &[&Protein]) -> Option { - let taxon_ids: Vec = proteins.iter().map(|prot| prot.taxon_id).collect(); - - self.taxon_id_calculator - .aggregate(taxon_ids) - .map(|id| self.taxon_id_calculator.snap_taxon(id)) - } - - /// Returns true if the protein is considered valid by the provided taxonomy file - /// - /// # Arguments - /// * `protein` - A protein of which we want to check the validity - /// - /// # Returns - /// - /// Returns true if the protein is considered valid by the provided taxonomy file - pub fn taxon_valid(&self, protein: &Protein) -> bool { - self.taxon_id_calculator.taxon_valid(protein.taxon_id) - } - - /// Retrieves the functional analysis for a collection of proteins - /// - /// # Arguments - /// * `proteins` - A collection of proteins - /// - /// # Returns - /// - /// Returns the functional analysis result for the given list of proteins - pub fn retrieve_function(&self, proteins: &[&Protein]) -> Option { - let res = self.function_aggregator.aggregate(proteins.to_vec()); - Some(res) - } - - /// Retrieves the all the functional annotations for a collection of proteins - /// - /// # Arguments - /// * `proteins` - A collection of proteins - /// - /// # Returns - /// - /// Returns all functional annotations for a collection of proteins - pub fn get_all_functional_annotations(&self, proteins: &[&Protein]) -> Vec> { - self.function_aggregator - .get_all_functional_annotations(proteins) - } } #[cfg(test)] mod tests { - use std::{ - fs::File, - io::Write, - path::PathBuf - }; - - use sa_mappings::{ - functionality::FunctionAggregator, - proteins::{ - Protein, - Proteins - }, - taxonomy::{ - AggregationMethod, - TaxonAggregator - } - }; - use tempdir::TempDir; + use sa_mappings::proteins::{Protein, Proteins}; use crate::{ - sa_searcher::{ - BoundSearchResult, - SearchAllSuffixesResult, - Searcher - }, + sa_searcher::{BoundSearchResult, SearchAllSuffixesResult, Searcher}, suffix_to_protein_index::SparseSuffixToProtein, SuffixArray }; @@ -597,51 +486,29 @@ mod tests { assert_ne!(search_all_suffixes_result_4, search_all_suffixes_result_7); 
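The wrapper types introduced above reduce construction to a two-argument call and expose the full Searcher API through Deref. A usage sketch reusing the example proteins and suffix array from the tests below (assumes the same imports as this test module plus SparseSearcher):

#[test]
fn sparse_searcher_usage_sketch() {
    let proteins = get_example_proteins();
    let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1);

    // SparseSearcher builds its own SparseSuffixToProtein mapping internally.
    let searcher = SparseSearcher::new(sa, proteins);

    // Deref<Target = Searcher> lets the search methods be called on the wrapper directly.
    let found = searcher.search_matching_suffixes(&[b'A', b'C'], usize::MAX, false);
    assert!(matches!(found, SearchAllSuffixesResult::SearchResult(_)));
}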
} - fn create_taxonomy_file(tmp_dir: &TempDir) -> PathBuf { - let taxonomy_file = tmp_dir.path().join("taxonomy.tsv"); - let mut file = File::create(&taxonomy_file).unwrap(); - - writeln!(file, "1\troot\tno rank\t1\t\x01").unwrap(); - writeln!(file, "2\tBacteria\tsuperkingdom\t1\t\x01").unwrap(); - writeln!(file, "6\tAzorhizobium\tgenus\t1\t\x01").unwrap(); - writeln!(file, "7\tAzorhizobium caulinodans\tspecies\t6\t\x01").unwrap(); - writeln!(file, "9\tBuchnera aphidicola\tspecies\t6\t\x01").unwrap(); - writeln!(file, "10\tCellvibrio\tgenus\t6\t\x01").unwrap(); - writeln!(file, "11\tCellulomonas gilvus\tspecies\t10\t\x01").unwrap(); - writeln!(file, "13\tDictyoglomus\tgenus\t11\t\x01").unwrap(); - writeln!(file, "14\tDictyoglomus thermophilum\tspecies\t10\t\x01").unwrap(); - writeln!(file, "16\tMethylophilus\tgenus\t14\t\x01").unwrap(); - writeln!(file, "17\tMethylophilus methylotrophus\tspecies\t16\t\x01").unwrap(); - writeln!(file, "18\tPelobacter\tgenus\t17\t\x01").unwrap(); - writeln!(file, "19\tSyntrophotalea carbinolica\tspecies\t17\t\x01").unwrap(); - writeln!(file, "20\tPhenylobacterium\tgenus\t19\t\x01").unwrap(); - - taxonomy_file - } - fn get_example_proteins() -> Proteins { let text = "AI-BLACVAA-AC-KCRLZ$".to_string().into_bytes(); Proteins { input_string: text, - proteins: vec![ + proteins: vec![ Protein { - uniprot_id: String::new(), - taxon_id: 0, + uniprot_id: String::new(), + taxon_id: 0, functional_annotations: vec![] }, Protein { - uniprot_id: String::new(), - taxon_id: 0, + uniprot_id: String::new(), + taxon_id: 0, functional_annotations: vec![] }, Protein { - uniprot_id: String::new(), - taxon_id: 0, + uniprot_id: String::new(), + taxon_id: 0, functional_annotations: vec![] }, Protein { - uniprot_id: String::new(), - taxon_id: 0, + uniprot_id: String::new(), + taxon_id: 0, functional_annotations: vec![] }, ] @@ -651,25 +518,10 @@ mod tests { #[test] fn test_search_simple() { let proteins = get_example_proteins(); - let sa = SuffixArray::Original( - vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], - 1 - ); - - let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); - let taxonomy_file = create_taxonomy_file(&tmp_dir); - - let searcher = Searcher::new( - sa, - Box::new(SparseSuffixToProtein::new(&proteins.input_string)), - proteins, - TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::LcaStar - ) - .unwrap(), - FunctionAggregator {} - ); + let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1); + + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'A' let bounds_res = searcher.search_bounds(&[b'A']); @@ -689,24 +541,11 @@ mod tests { let proteins = get_example_proteins(); let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3); - let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); - let taxonomy_file = create_taxonomy_file(&tmp_dir); - - let searcher = Searcher::new( - sa, - Box::new(SparseSuffixToProtein::new(&proteins.input_string)), - proteins, - TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::LcaStar - ) - .unwrap(), - FunctionAggregator {} - ); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search suffix 'VAA' 
- let found_suffixes = - searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false); + let found_suffixes = searcher.search_matching_suffixes(&[b'V', b'A', b'A'], usize::MAX, false); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![7])); // search suffix 'AC' @@ -717,25 +556,10 @@ mod tests { #[test] fn test_il_equality() { let proteins = get_example_proteins(); - let sa = SuffixArray::Original( - vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], - 1 - ); - - let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); - let taxonomy_file = create_taxonomy_file(&tmp_dir); - - let searcher = Searcher::new( - sa, - Box::new(SparseSuffixToProtein::new(&proteins.input_string)), - proteins, - TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::LcaStar - ) - .unwrap(), - FunctionAggregator {} - ); + let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1); + + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); let bounds_res = searcher.search_bounds(&[b'I']); assert_eq!(bounds_res, BoundSearchResult::SearchResult((13, 16))); @@ -750,29 +574,15 @@ mod tests { let proteins = get_example_proteins(); let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3); - let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); - let taxonomy_file = create_taxonomy_file(&tmp_dir); - - let searcher = Searcher::new( - sa, - Box::new(SparseSuffixToProtein::new(&proteins.input_string)), - proteins, - TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::LcaStar - ) - .unwrap(), - FunctionAggregator {} - ); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'RIZ' with equal I and L - let found_suffixes = - searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true); + let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![16])); // search bounds 'RIZ' without equal I and L - let found_suffixes = - searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false); + let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false); assert_eq!(found_suffixes, SearchAllSuffixesResult::NoMatches); } @@ -783,28 +593,16 @@ mod tests { let proteins = Proteins { input_string: text, - proteins: vec![Protein { - uniprot_id: String::new(), - taxon_id: 0, + proteins: vec![Protein { + uniprot_id: String::new(), + taxon_id: 0, functional_annotations: vec![] }] }; - let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); - let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2); - let searcher = Searcher::new( - sparse_sa, - Box::new(SparseSuffixToProtein::new(&proteins.input_string)), - proteins, - TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::LcaStar - ) - .unwrap(), - FunctionAggregator {} - ); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'IM' with equal 
I and L let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'M'], usize::MAX, true); @@ -817,28 +615,16 @@ mod tests { let proteins = Proteins { input_string: text, - proteins: vec![Protein { - uniprot_id: String::new(), - taxon_id: 0, + proteins: vec![Protein { + uniprot_id: String::new(), + taxon_id: 0, functional_annotations: vec![] }] }; - let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); - let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1); - let searcher = Searcher::new( - sparse_sa, - Box::new(SparseSuffixToProtein::new(&proteins.input_string)), - proteins, - TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::LcaStar - ) - .unwrap(), - FunctionAggregator {} - ); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![2, 3, 4, 5])); @@ -850,28 +636,16 @@ mod tests { let proteins = Proteins { input_string: text, - proteins: vec![Protein { - uniprot_id: String::new(), - taxon_id: 0, + proteins: vec![Protein { + uniprot_id: String::new(), + taxon_id: 0, functional_annotations: vec![] }] }; - let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); - let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); - let searcher = Searcher::new( - sparse_sa, - Box::new(SparseSuffixToProtein::new(&proteins.input_string)), - proteins, - TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::LcaStar - ) - .unwrap(), - FunctionAggregator {} - ); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![0, 1, 2, 3, 4])); @@ -883,28 +657,16 @@ mod tests { let proteins = Proteins { input_string: text, - proteins: vec![Protein { - uniprot_id: String::new(), - taxon_id: 0, + proteins: vec![Protein { + uniprot_id: String::new(), + taxon_id: 0, functional_annotations: vec![] }] }; - let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); - let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2); - let searcher = Searcher::new( - sparse_sa, - Box::new(SparseSuffixToProtein::new(&proteins.input_string)), - proteins, - TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::LcaStar - ) - .unwrap(), - FunctionAggregator {} - ); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); // search all places where II is in the string IIIILL, but with a sparse SA // this way we check if filtering the suffixes works as expected @@ -918,28 +680,16 @@ mod tests { let proteins = Proteins { input_string: text, - proteins: vec![Protein { - uniprot_id: String::new(), - taxon_id: 0, + proteins: vec![Protein { + uniprot_id: String::new(), + taxon_id: 0, functional_annotations: vec![] }] }; - let tmp_dir = 
TempDir::new("test_try_from_taxonomy_file").unwrap(); - let taxonomy_file = create_taxonomy_file(&tmp_dir); - let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); - let searcher = Searcher::new( - sparse_sa, - Box::new(SparseSuffixToProtein::new(&proteins.input_string)), - proteins, - TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::LcaStar - ) - .unwrap(), - FunctionAggregator {} - ); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'IM' with equal I and L let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true); diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs index 0091fed..121b569 100644 --- a/sa-index/src/suffix_to_protein_index.rs +++ b/sa-index/src/suffix_to_protein_index.rs @@ -1,8 +1,5 @@ use clap::ValueEnum; -use sa_mappings::proteins::{ - SEPARATION_CHARACTER, - TERMINATION_CHARACTER -}; +use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER}; use crate::Nullable; @@ -49,10 +46,7 @@ impl SuffixToProteinIndex for DenseSuffixToProtein { impl SuffixToProteinIndex for SparseSuffixToProtein { fn suffix_to_protein(&self, suffix: i64) -> u32 { - let protein_index = self - .mapping - .binary_search(&suffix) - .unwrap_or_else(|index| index - 1); + let protein_index = self.mapping.binary_search(&suffix).unwrap_or_else(|index| index - 1); // if the next value in the mapping is 1 larger than the current suffix, that means that the // current suffix starts with a SEPARATION_CHARACTER or TERMINATION_CHARACTER // this means it does not belong to a protein @@ -85,9 +79,7 @@ impl DenseSuffixToProtein { } } suffix_index_to_protein.shrink_to_fit(); - DenseSuffixToProtein { - mapping: suffix_index_to_protein - } + DenseSuffixToProtein { mapping: suffix_index_to_protein } } } @@ -108,26 +100,18 @@ impl SparseSuffixToProtein { } } suffix_index_to_protein.shrink_to_fit(); - SparseSuffixToProtein { - mapping: suffix_index_to_protein - } + SparseSuffixToProtein { mapping: suffix_index_to_protein } } } #[cfg(test)] mod tests { use clap::ValueEnum; - use sa_mappings::proteins::{ - SEPARATION_CHARACTER, - TERMINATION_CHARACTER - }; + use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER}; use crate::{ suffix_to_protein_index::{ - DenseSuffixToProtein, - SparseSuffixToProtein, - SuffixToProteinIndex, - SuffixToProteinMappingStyle + DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex, SuffixToProteinMappingStyle }, Nullable }; @@ -140,10 +124,7 @@ mod tests { #[test] fn test_suffix_to_protein_mapping_style() { - assert_eq!( - SuffixToProteinMappingStyle::Dense, - SuffixToProteinMappingStyle::from_str("dense", false).unwrap() - ); + assert_eq!(SuffixToProteinMappingStyle::Dense, SuffixToProteinMappingStyle::from_str("dense", false).unwrap()); assert_eq!( SuffixToProteinMappingStyle::Sparse, SuffixToProteinMappingStyle::from_str("sparse", false).unwrap() @@ -164,9 +145,7 @@ mod tests { fn test_sparse_build() { let u8_text = &build_text(); let index = SparseSuffixToProtein::new(u8_text); - let expected = SparseSuffixToProtein { - mapping: vec![0, 4, 7, 11] - }; + let expected = SparseSuffixToProtein { mapping: vec![0, 4, 7, 11] }; assert_eq!(index, expected); } diff --git a/sa-mappings/Cargo.toml b/sa-mappings/Cargo.toml index 6573fb0..b20a2bf 100644 --- a/sa-mappings/Cargo.toml +++ 
b/sa-mappings/Cargo.toml @@ -11,7 +11,3 @@ tempdir = "0.3.7" [dependencies] fa-compression = { path = "../fa-compression" } bytelines = "2.5.0" -umgap = "1.1.0" -serde_json = "1.0.115" -serde = { version = "1.0.197", features = ["derive"] } - diff --git a/sa-mappings/src/functionality.rs b/sa-mappings/src/functionality.rs deleted file mode 100644 index 7a64150..0000000 --- a/sa-mappings/src/functionality.rs +++ /dev/null @@ -1,223 +0,0 @@ -//! This module contains the FunctionAggregator struct that is responsible for aggregating the -//! functional annotations of proteins. - -use std::collections::{ - HashMap, - HashSet -}; - -use serde::Serialize; - -use crate::proteins::Protein; - -/// A struct that represents the functional annotations once aggregated -#[derive(Debug, Serialize)] -pub struct FunctionalAggregation { - /// A HashMap representing how many GO, EC and IPR terms were found - pub counts: HashMap, - /// A HashMap representing how often a certain functional annotation was found - pub data: HashMap -} - -/// A struct that represents a function aggregator -pub struct FunctionAggregator {} - -impl FunctionAggregator { - /// Aggregates the functional annotations of proteins - /// - /// # Arguments - /// * `proteins` - A vector of proteins - /// - /// # Returns - /// - /// Returns a JSON string containing the aggregated functional annotations - pub fn aggregate(&self, proteins: Vec<&Protein>) -> FunctionalAggregation { - // Keep track of the proteins that have any annotation - let mut proteins_with_annotations: HashSet = HashSet::new(); - - // Keep track of the proteins that have a certain annotation - let mut proteins_with_ec: HashSet = HashSet::new(); - let mut proteins_with_go: HashSet = HashSet::new(); - let mut proteins_with_ipr: HashSet = HashSet::new(); - - // Keep track of the counts of the different annotations - let mut data: HashMap = HashMap::new(); - - for protein in proteins.iter() { - for annotation in protein.get_functional_annotations().split(';') { - match annotation.chars().next() { - Some('E') => { - proteins_with_ec.insert(protein.uniprot_id.clone()); - proteins_with_annotations.insert(protein.uniprot_id.clone()); - } - Some('G') => { - proteins_with_go.insert(protein.uniprot_id.clone()); - proteins_with_annotations.insert(protein.uniprot_id.clone()); - } - Some('I') => { - proteins_with_ipr.insert(protein.uniprot_id.clone()); - proteins_with_annotations.insert(protein.uniprot_id.clone()); - } - _ => {} - }; - - data.entry(annotation.to_string()) - .and_modify(|c| *c += 1) - .or_insert(1); - } - } - - let mut counts: HashMap = HashMap::new(); - counts.insert("all".to_string(), proteins_with_annotations.len()); - counts.insert("EC".to_string(), proteins_with_ec.len()); - counts.insert("GO".to_string(), proteins_with_go.len()); - counts.insert("IPR".to_string(), proteins_with_ipr.len()); - - data.remove(""); - - FunctionalAggregation { - counts, - data - } - } - - /// Aggregates the functional annotations of proteins - /// - /// # Arguments - /// * `proteins` - A vector of proteins - /// - /// # Returns - /// - /// Returns a list of lists with all the functional annotations per protein - pub fn get_all_functional_annotations(&self, proteins: &[&Protein]) -> Vec> { - proteins - .iter() - .map(|&prot| { - prot.get_functional_annotations() - .split(';') - .map(|ann| ann.to_string()) - .filter(|s| !s.is_empty()) - .collect() - }) - .collect::>>() - } -} - -#[cfg(test)] -mod tests { - use fa_compression::algorithm1::encode; - - use super::*; - - #[test] - fn 
test_aggregate() { - let mut proteins: Vec = Vec::new(); - - let protein1 = Protein { - uniprot_id: "P12345".to_string(), - taxon_id: 9606, - functional_annotations: encode("GO:0001234;GO:0005678") - }; - let protein2 = Protein { - uniprot_id: "P23456".to_string(), - taxon_id: 9606, - functional_annotations: encode("EC:1.1.1.-") - }; - let protein3 = Protein { - uniprot_id: "P23876".to_string(), - taxon_id: 9606, - functional_annotations: encode("IPR:IPR123456;EC:1.1.1.-") - }; - let protein4 = Protein { - uniprot_id: "P23877".to_string(), - taxon_id: 9606, - functional_annotations: encode("2345") - }; - - proteins.push(protein1); - proteins.push(protein2); - proteins.push(protein3); - proteins.push(protein4); - - let function_aggregator = FunctionAggregator {}; - - let result = function_aggregator.aggregate(proteins.iter().collect()); - - assert_eq!(result.counts.get("all"), Some(&3)); - assert_eq!(result.counts.get("EC"), Some(&2)); - assert_eq!(result.counts.get("GO"), Some(&1)); - assert_eq!(result.counts.get("IPR"), Some(&1)); - assert_eq!(result.counts.get("NOTHING"), None); - - assert_eq!(result.data, { - let mut map = HashMap::new(); - map.insert("GO:0001234".to_string(), 1); - map.insert("GO:0005678".to_string(), 1); - map.insert("EC:1.1.1.-".to_string(), 2); - map.insert("IPR:IPR123456".to_string(), 1); - map - }); - assert_eq!(result.data.get("EC:1.1.2.-"), None); - } - - #[test] - fn test_get_all_functional_annotations() { - let mut proteins: Vec<&Protein> = Vec::new(); - - let protein1 = Protein { - uniprot_id: "P12345".to_string(), - taxon_id: 9606, - functional_annotations: encode("GO:0001234;GO:0005678") - }; - let protein2 = Protein { - uniprot_id: "P23456".to_string(), - taxon_id: 9606, - functional_annotations: encode("EC:1.1.1.-") - }; - let protein3 = Protein { - uniprot_id: "P23876".to_string(), - taxon_id: 9606, - functional_annotations: encode("IPR:IPR123456;EC:1.1.1.-") - }; - - proteins.push(&protein1); - proteins.push(&protein2); - proteins.push(&protein3); - - let function_aggregator = FunctionAggregator {}; - - let result = function_aggregator.get_all_functional_annotations(proteins.as_slice()); - - assert_eq!(result.len(), 3); - assert_eq!(result[0].len(), 2); - assert_eq!(result[1].len(), 1); - assert_eq!(result[2].len(), 2); - } - - #[test] - fn test_serialize_functional_aggregation() { - let mut proteins: Vec = Vec::new(); - proteins.push(Protein { - uniprot_id: "P12345".to_string(), - taxon_id: 9606, - functional_annotations: encode("GO:0001234;GO:0005678") - }); - proteins.push(Protein { - uniprot_id: "P23456".to_string(), - taxon_id: 9606, - functional_annotations: encode("EC:1.1.1.-") - }); - - let function_aggregator = FunctionAggregator {}; - - let result = function_aggregator.aggregate(proteins.iter().collect()); - - let generated_json = serde_json::to_string(&result).unwrap(); - let expected_json = "{\"counts\":{\"all\":2,\"GO\":1,\"EC\":1,\"IPR\":0},\"data\":{\"GO:0001234\":1,\"GO:0005678\":1,\"EC:1.1.1.-\":1}}"; - - assert_eq!( - generated_json.parse::().unwrap(), - expected_json.parse::().unwrap(), - ); - } -} diff --git a/sa-mappings/src/lib.rs b/sa-mappings/src/lib.rs index 6986c13..0d53d38 100644 --- a/sa-mappings/src/lib.rs +++ b/sa-mappings/src/lib.rs @@ -3,6 +3,4 @@ #![warn(missing_docs)] -pub mod functionality; pub mod proteins; -pub mod taxonomy; diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs index e1d2f49..f2b24cc 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -1,22 +1,10 @@ //! 
This module contains the `Protein` and `Proteins` structs, which are used to represent proteins //! and collections of proteins, respectively. -use std::{ - error::Error, - fs::File, - io::BufReader, - ops::Index, - str::from_utf8 -}; +use std::{error::Error, fs::File, io::BufReader, ops::Index, str::from_utf8}; use bytelines::ByteLines; -use fa_compression::algorithm1::{ - decode, - encode -}; -use umgap::taxon::TaxonId; - -use crate::taxonomy::TaxonAggregator; +use fa_compression::algorithm1::{decode, encode}; /// The separation character used in the input string pub static SEPARATION_CHARACTER: u8 = b'-'; @@ -31,7 +19,7 @@ pub struct Protein { pub uniprot_id: String, /// the taxon id of the protein - pub taxon_id: TaxonId, + pub taxon_id: u32, /// The encoded functional annotations of the protein pub functional_annotations: Vec @@ -67,10 +55,7 @@ impl Proteins { /// # Errors /// /// Returns a `Box` if an error occurred while reading the database file - pub fn try_from_database_file( - file: &str, - taxon_aggregator: &TaxonAggregator - ) -> Result> { + pub fn try_from_database_file(file: &str) -> Result> { let mut input_string: String = String::new(); let mut proteins: Vec = Vec::new(); @@ -85,14 +70,10 @@ impl Proteins { // uniprot_id, taxon_id and sequence should always contain valid utf8 let uniprot_id = from_utf8(fields.next().unwrap())?; - let taxon_id = from_utf8(fields.next().unwrap())?.parse::()?; + let taxon_id = from_utf8(fields.next().unwrap())?.parse()?; let sequence = from_utf8(fields.next().unwrap())?; let functional_annotations: Vec = encode(from_utf8(fields.next().unwrap())?); - if !taxon_aggregator.taxon_exists(taxon_id) { - continue; - } - input_string.push_str(&sequence.to_uppercase()); input_string.push(SEPARATION_CHARACTER.into()); @@ -107,10 +88,7 @@ impl Proteins { input_string.push(TERMINATION_CHARACTER.into()); input_string.shrink_to_fit(); proteins.shrink_to_fit(); - Ok(Self { - input_string: input_string.into_bytes(), - proteins - }) + Ok(Self { input_string: input_string.into_bytes(), proteins }) } /// Creates a `vec` which represents all the proteins concatenated from the database file @@ -126,10 +104,7 @@ impl Proteins { /// # Errors /// /// Returns a `Box` if an error occurred while reading the database file - pub fn try_from_database_file_without_annotations( - database_file: &str, - taxon_aggregator: &TaxonAggregator - ) -> Result, Box> { + pub fn try_from_database_file_without_annotations(database_file: &str) -> Result, Box> { let mut input_string: String = String::new(); let file = File::open(database_file)?; @@ -142,14 +117,7 @@ impl Proteins { let mut fields = line.split(|b| *b == b'\t'); // only get the taxon id and sequence from each line, we don't need the other parts - fields.next(); - let taxon_id = from_utf8(fields.next().unwrap())?.parse::()?; - let sequence = from_utf8(fields.next().unwrap())?; - fields.next(); - - if !taxon_aggregator.taxon_exists(taxon_id) { - continue; - } + let sequence = from_utf8(fields.nth(2).unwrap())?; input_string.push_str(&sequence.to_uppercase()); input_string.push(SEPARATION_CHARACTER.into()); @@ -173,67 +141,36 @@ impl Index for Proteins { #[cfg(test)] mod tests { - use std::{ - fs::File, - io::Write, - path::PathBuf - }; + use std::{fs::File, io::Write, path::PathBuf}; use tempdir::TempDir; use super::*; - use crate::taxonomy::AggregationMethod; fn create_database_file(tmp_dir: &TempDir) -> PathBuf { let database_file = tmp_dir.path().join("database.tsv"); let mut file = File::create(&database_file).unwrap(); + 
file.write("P12345\t1\tMLPGLALLLLAAWTARALEV\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes()) + .unwrap(); + file.write("P54321\t2\tPTDGNAGLLAEPQIAMFCGRLNMHMNVQNG\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes()) + .unwrap(); + file.write("P67890\t6\tKWDSDPSGTKTCIDT\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes()) + .unwrap(); file.write( - "P12345\t1\tMLPGLALLLLAAWTARALEV\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes() - ) - .unwrap(); - file.write( - "P54321\t2\tPTDGNAGLLAEPQIAMFCGRLNMHMNVQNG\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n" + "P13579\t17\tKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n" .as_bytes() ) .unwrap(); - file.write( - "P67890\t6\tKWDSDPSGTKTCIDT\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes() - ) - .unwrap(); - file.write("P13579\t17\tKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n".as_bytes()) - .unwrap(); database_file } - fn create_taxonomy_file(tmp_dir: &TempDir) -> PathBuf { - let taxonomy_file = tmp_dir.path().join("taxonomy.tsv"); - let mut file = File::create(&taxonomy_file).unwrap(); - - writeln!(file, "1\troot\tno rank\t1\t\x01").unwrap(); - writeln!(file, "2\tBacteria\tsuperkingdom\t1\t\x01").unwrap(); - writeln!(file, "6\tAzorhizobium\tgenus\t1\t\x01").unwrap(); - writeln!(file, "7\tAzorhizobium caulinodans\tspecies\t6\t\x01").unwrap(); - writeln!(file, "9\tBuchnera aphidicola\tspecies\t6\t\x01").unwrap(); - writeln!(file, "10\tCellvibrio\tgenus\t6\t\x01").unwrap(); - writeln!(file, "11\tCellulomonas gilvus\tspecies\t10\t\x01").unwrap(); - writeln!(file, "13\tDictyoglomus\tgenus\t11\t\x01").unwrap(); - writeln!(file, "14\tDictyoglomus thermophilum\tspecies\t10\t\x01").unwrap(); - writeln!(file, "16\tMethylophilus\tgenus\t14\t\x01").unwrap(); - writeln!(file, "17\tMethylophilus methylotrophus\tspecies\t16\t\x01").unwrap(); - writeln!(file, "18\tPelobacter\tgenus\t17\t\x01").unwrap(); - writeln!(file, "19\tSyntrophotalea carbinolica\tspecies\t17\t\x01").unwrap(); - writeln!(file, "20\tPhenylobacterium\tgenus\t19\t\x01").unwrap(); - - taxonomy_file - } - #[test] fn test_new_protein() { let protein = Protein { - uniprot_id: "P12345".to_string(), - taxon_id: 1, + uniprot_id: "P12345".to_string(), + taxon_id: 1, functional_annotations: vec![0xD1, 0x11] }; @@ -245,27 +182,22 @@ mod tests { #[test] fn test_new_proteins() { let proteins = Proteins { - input_string: "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG" - .as_bytes() - .to_vec(), - proteins: vec![ + input_string: "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes().to_vec(), + proteins: vec![ Protein { - uniprot_id: "P12345".to_string(), - taxon_id: 1, + uniprot_id: "P12345".to_string(), + taxon_id: 1, functional_annotations: vec![0xD1, 0x11] }, Protein { - uniprot_id: "P54321".to_string(), - taxon_id: 2, + uniprot_id: "P54321".to_string(), + taxon_id: 2, functional_annotations: vec![0xD1, 0x11] }, ] }; - assert_eq!( - proteins.input_string, - "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes() - ); + assert_eq!(proteins.input_string, "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes()); assert_eq!(proteins.proteins.len(), 2); assert_eq!(proteins[0].uniprot_id, "P12345"); assert_eq!(proteins[0].taxon_id, 1); @@ -281,17 +213,8 @@ mod tests { let tmp_dir = TempDir::new("test_get_taxon").unwrap(); let database_file = create_database_file(&tmp_dir); - let taxonomy_file = create_taxonomy_file(&tmp_dir); - let taxon_aggregator = 
TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::Lca - ) - .unwrap(); - - let proteins = - Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator) - .unwrap(); + let proteins = Proteins::try_from_database_file(database_file.to_str().unwrap()).unwrap(); let taxa = vec![1, 2, 6, 17]; for (i, protein) in proteins.proteins.iter().enumerate() { @@ -305,23 +228,11 @@ mod tests { let tmp_dir = TempDir::new("test_get_fa").unwrap(); let database_file = create_database_file(&tmp_dir); - let taxonomy_file = create_taxonomy_file(&tmp_dir); - - let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::Lca - ) - .unwrap(); - let proteins = - Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator) - .unwrap(); + let proteins = Proteins::try_from_database_file(database_file.to_str().unwrap()).unwrap(); for protein in proteins.proteins.iter() { - assert_eq!( - protein.get_functional_annotations(), - "GO:0009279;IPR:IPR016364;IPR:IPR008816" - ); + assert_eq!(protein.get_functional_annotations(), "GO:0009279;IPR:IPR016364;IPR:IPR008816"); } } @@ -331,19 +242,8 @@ mod tests { let tmp_dir = TempDir::new("test_get_fa").unwrap(); let database_file = create_database_file(&tmp_dir); - let taxonomy_file = create_taxonomy_file(&tmp_dir); - let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::Lca - ) - .unwrap(); - - let proteins = Proteins::try_from_database_file_without_annotations( - database_file.to_str().unwrap(), - &taxon_aggregator - ) - .unwrap(); + let proteins = Proteins::try_from_database_file_without_annotations(database_file.to_str().unwrap()).unwrap(); let sep_char = SEPARATION_CHARACTER as char; let end_char = TERMINATION_CHARACTER as char; diff --git a/sa-mappings/src/taxonomy.rs b/sa-mappings/src/taxonomy.rs deleted file mode 100644 index 70c65f6..0000000 --- a/sa-mappings/src/taxonomy.rs +++ /dev/null @@ -1,343 +0,0 @@ -//! This module provides a `TaxonAggregator` struct that is used to aggregate taxonomic information. -//! It uses a taxonomy file to create a taxonomic tree and performs aggregation using different -//! methods. - -use std::error::Error; - -use umgap::{ - agg::{ - count, - MultiThreadSafeAggregator - }, - rmq::{ - lca::LCACalculator, - mix::MixCalculator - }, - taxon::{ - read_taxa_file, - Taxon, - TaxonId, - TaxonList, - TaxonTree - } -}; - -/// A struct that represents a taxon aggregator. -pub struct TaxonAggregator { - /// A vector that contains the snapped taxon IDs. - snapping: Vec>, - /// The aggregator used to aggregate taxon IDs. - aggregator: Box, - /// The taxon list. - taxon_list: TaxonList -} - -/// An enum that specifies the aggregation method to use. -pub enum AggregationMethod { - /// The Lowest Common Ancestor (LCA) aggregation method. - Lca, - - /// The LCA* aggregation method. - LcaStar -} - -impl TaxonAggregator { - /// Creates a new `TaxonAggregator` with the given taxa and aggregation method. - /// - /// # Arguments - /// - /// * `taxa` - A vector of `Taxon` objects representing the taxa. - /// * `method` - An `AggregationMethod` enum specifying the aggregation method to use. - /// - /// # Returns - /// - /// Returns a new `TaxonAggregator` instance. 
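// Illustrative sketch, not part of the patch: a minimal construction of the (now removed)
// aggregator, mirroring the deleted `test_new` test further down. `Taxon` and `Rank` come
// from the `umgap` crate, as used in the deleted tests of this file.
fn build_example_aggregator() -> TaxonAggregator {
    let taxa = vec![
        Taxon::new(1, "root".to_string(), Rank::NoRank, 1, true),
        Taxon::new(2, "Bacteria".to_string(), Rank::Superkingdom, 1, true),
        Taxon::new(6, "Azorhizobium".to_string(), Rank::Genus, 1, true),
    ];
    TaxonAggregator::new(taxa, AggregationMethod::Lca)
}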
- pub fn new(taxa: Vec, method: AggregationMethod) -> Self { - let taxon_tree = TaxonTree::new(&taxa); - let taxon_list = TaxonList::new(taxa); - let snapping = taxon_tree.snapping(&taxon_list, true); - - let aggregator: Box = match method { - AggregationMethod::Lca => Box::new(MixCalculator::new(taxon_tree, 1.0)), - AggregationMethod::LcaStar => Box::new(LCACalculator::new(taxon_tree)) - }; - - Self { - snapping, - aggregator, - taxon_list - } - } - - /// Creates a new `TaxonAggregator` from a taxonomy file and an aggregation method. - /// - /// # Arguments - /// - /// * `file` - A string slice that represents the path to the taxonomy file. - /// * `method` - An `AggregationMethod` enum that specifies the aggregation method to use. - /// - /// # Returns - /// - /// Returns a `Result` containing the `TaxonAggregator` - /// - /// # Errors - /// - /// Returns a `Box` if an error occurred while reading the taxonomy file. - pub fn try_from_taxonomy_file( - file: &str, - method: AggregationMethod - ) -> Result> { - let taxons = read_taxa_file(file)?; - Ok(Self::new(taxons, method)) - } - - /// Checks if a taxon exists in the taxon list. - /// - /// # Arguments - /// - /// * `taxon` - The taxon ID to check. - /// - /// # Returns - /// - /// Returns a boolean value indicating whether the taxon exists in the taxon list. - pub fn taxon_exists(&self, taxon: TaxonId) -> bool { - self.taxon_list.get(taxon).is_some() - } - - /// Checks if a taxon is valid to be used during taxonomic aggregation - /// - /// # Arguments - /// - /// * `taxon` - The taxon ID to check. - /// - /// # Returns - /// - /// Returns a boolean value indicating whether the taxon exists and is valid - pub fn taxon_valid(&self, taxon: TaxonId) -> bool { - let optional_taxon = self.taxon_list.get(taxon); - match optional_taxon { - None => false, - Some(taxon) => taxon.valid - } - } - - /// Snaps a taxon to its closest ancestor in the taxonomic tree. - /// - /// # Arguments - /// - /// * `taxon` - The taxon ID to snap. - /// - /// # Returns - /// - /// Returns the snapped taxon ID, or panics if the taxon cannot be snapped. - pub fn snap_taxon(&self, taxon: TaxonId) -> TaxonId { - self.snapping[taxon].unwrap_or_else(|| panic!("Could not snap taxon with id {taxon}")) - } - - /// Aggregates a list of taxon IDs using the specified aggregation method. - /// - /// # Arguments - /// - /// * `taxa` - A vector of taxon IDs to aggregate. - /// * `clean_taxa` - If true, only the taxa which are stored as "valid" are used during - /// aggregation - /// - /// # Returns - /// - /// Returns the aggregated taxon ID wrapped in Some if aggregation succeeds, - /// Returns None if the list of taxa to aggregate is emtpy, - /// Panics if aggregation fails. 
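// Illustrative sketch, not part of the patch: the contract documented above, expressed
// against the taxonomy fixture from the deleted tests below. With the LCA method,
// taxa 7 and 9 (two species under genus 6) aggregate to their lowest common ancestor.
fn demo_aggregate(aggregator: &TaxonAggregator) {
    // An empty list of taxa yields no aggregate.
    assert_eq!(aggregator.aggregate(vec![]), None);
    // Two species below genus 6 snap up to their lowest common ancestor.
    assert_eq!(aggregator.aggregate(vec![7, 9]), Some(6));
}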
- pub fn aggregate(&self, taxa: Vec) -> Option { - if taxa.is_empty() { - return None; - } - - let count = count(taxa.into_iter().map(|t| (t, 1.0_f32))); - Some( - self.aggregator.aggregate(&count).unwrap_or_else(|_| { - panic!("Could not aggregate following taxon ids: {:?}", &count) - }) - ) - } -} - -#[cfg(test)] -mod tests { - use std::{ - fs::File, - io::Write, - path::PathBuf - }; - - use tempdir::TempDir; - use umgap::rank::Rank; - - use super::*; - - fn create_taxonomy_file(tmp_dir: &TempDir) -> PathBuf { - let taxonomy_file = tmp_dir.path().join("taxonomy.tsv"); - let mut file = File::create(&taxonomy_file).unwrap(); - - writeln!(file, "1\troot\tno rank\t1\t\x01").unwrap(); - writeln!(file, "2\tBacteria\tsuperkingdom\t1\t\x01").unwrap(); - writeln!(file, "6\tAzorhizobium\tgenus\t1\t\x01").unwrap(); - writeln!(file, "7\tAzorhizobium caulinodans\tspecies\t6\t\x01").unwrap(); - writeln!(file, "9\tBuchnera aphidicola\tspecies\t6\t\x01").unwrap(); - writeln!(file, "10\tCellvibrio\tgenus\t6\t\x01").unwrap(); - writeln!(file, "11\tCellulomonas gilvus\tspecies\t10\t\x01").unwrap(); - writeln!(file, "13\tDictyoglomus\tgenus\t11\t\x01").unwrap(); - writeln!(file, "14\tDictyoglomus thermophilum\tspecies\t10\t\x01").unwrap(); - writeln!(file, "16\tMethylophilus\tgenus\t14\t\x01").unwrap(); - writeln!(file, "17\tMethylophilus methylotrophus\tspecies\t16\t\x01").unwrap(); - writeln!(file, "18\tPelobacter\tgenus\t17\t\x01").unwrap(); - writeln!(file, "19\tSyntrophotalea carbinolica\tspecies\t17\t\x01").unwrap(); - writeln!(file, "20\tPhenylobacterium\tgenus\t19\t\x01").unwrap(); - writeln!(file, "21\tInvalid\tspecies\t19\t\x00").unwrap(); - - taxonomy_file - } - - #[test] - fn test_new() { - TaxonAggregator::new( - vec![ - Taxon::new(1, "root".to_string(), Rank::NoRank, 1, true), - Taxon::new(2, "Bacteria".to_string(), Rank::Superkingdom, 1, true), - Taxon::new(6, "Azorhizobium".to_string(), Rank::Genus, 1, true), - Taxon::new(7, "Azorhizobium caulinodans".to_string(), Rank::Species, 6, true), - Taxon::new(9, "Buchnera aphidicola".to_string(), Rank::Species, 6, true), - Taxon::new(10, "Cellvibrio".to_string(), Rank::Genus, 6, true), - Taxon::new(11, "Cellulomonas gilvus".to_string(), Rank::Species, 10, true), - Taxon::new(13, "Dictyoglomus".to_string(), Rank::Genus, 11, true), - Taxon::new(14, "Dictyoglomus thermophilum".to_string(), Rank::Species, 10, true), - Taxon::new(16, "Methylophilus".to_string(), Rank::Genus, 14, true), - Taxon::new(17, "Methylophilus methylotrophus".to_string(), Rank::Species, 16, true), - Taxon::new(18, "Pelobacter".to_string(), Rank::Genus, 17, true), - Taxon::new(19, "Syntrophotalea carbinolica".to_string(), Rank::Species, 17, true), - Taxon::new(20, "Phenylobacterium".to_string(), Rank::Genus, 19, true), - Taxon::new(21, "Invalid".to_string(), Rank::Species, 19, false), - ], - AggregationMethod::Lca - ); - } - - #[test] - fn test_try_from_taxonomy_file() { - // Create a temporary directory for this test - let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); - - let taxonomy_file = create_taxonomy_file(&tmp_dir); - - let _ = TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::Lca - ) - .unwrap(); - - let _ = TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::LcaStar - ) - .unwrap(); - } - - #[test] - fn test_taxon_exists() { - // Create a temporary directory for this test - let tmp_dir = TempDir::new("test_taxon_exists").unwrap(); - - let taxonomy_file = 
create_taxonomy_file(&tmp_dir); - - let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::Lca - ) - .unwrap(); - - for i in 0 ..= 20 { - if [0, 3, 4, 5, 8, 12, 15].contains(&i) { - assert!(!taxon_aggregator.taxon_exists(i)); - } else { - assert!(taxon_aggregator.taxon_exists(i)); - } - } - } - - #[test] - fn test_taxon_valid() { - // Create a temporary directory for this test - let tmp_dir = TempDir::new("test_taxon_valid").unwrap(); - - let taxonomy_file = create_taxonomy_file(&tmp_dir); - - let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::Lca - ) - .unwrap(); - - for i in [1, 2, 6, 7, 9, 10, 11, 13, 14, 16, 17, 18, 19, 20].iter() { - assert!(taxon_aggregator.taxon_valid(*i)); - } - assert!(!taxon_aggregator.taxon_valid(21)); - assert!(!taxon_aggregator.taxon_valid(22)); - } - - #[test] - fn test_snap_taxon() { - // Create a temporary directory for this test - let tmp_dir = TempDir::new("test_snap_taxon").unwrap(); - - let taxonomy_file = create_taxonomy_file(&tmp_dir); - - let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::Lca - ) - .unwrap(); - - for i in 0 ..= 20 { - if ![0, 3, 4, 5, 8, 12, 15].contains(&i) { - assert_eq!(taxon_aggregator.snap_taxon(i), i); - } - } - } - - #[test] - fn test_aggregate_lca() { - // Create a temporary directory for this test - let tmp_dir = TempDir::new("test_aggregate").unwrap(); - - let taxonomy_file = create_taxonomy_file(&tmp_dir); - - let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::Lca - ) - .unwrap(); - - assert_eq!(taxon_aggregator.aggregate(vec![]), None); - assert_eq!(taxon_aggregator.aggregate(vec![7, 9]), Some(6)); - assert_eq!(taxon_aggregator.aggregate(vec![11, 14]), Some(10)); - assert_eq!(taxon_aggregator.aggregate(vec![17, 19]), Some(17)); - } - - #[test] - fn test_aggregate_lca_star() { - // Create a temporary directory for this test - let tmp_dir = TempDir::new("test_aggregate").unwrap(); - - let taxonomy_file = create_taxonomy_file(&tmp_dir); - - let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file( - taxonomy_file.to_str().unwrap(), - AggregationMethod::LcaStar - ) - .unwrap(); - - assert_eq!(taxon_aggregator.aggregate(vec![]), None); - assert_eq!(taxon_aggregator.aggregate(vec![7, 9]), Some(6)); - assert_eq!(taxon_aggregator.aggregate(vec![11, 14]), Some(10)); - assert_eq!(taxon_aggregator.aggregate(vec![17, 19]), Some(19)); - } -} diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs index ef774b2..5284546 100644 --- a/sa-server/src/main.rs +++ b/sa-server/src/main.rs @@ -1,53 +1,26 @@ use std::{ error::Error, fs::File, - io::{ - BufReader, - Read - }, + io::{BufReader, Read}, sync::Arc }; use axum::{ - extract::{ - DefaultBodyLimit, - State - }, + extract::{DefaultBodyLimit, State}, http::StatusCode, - routing::{ - get, - post - }, - Json, - Router + routing::post, + Json, Router }; use clap::Parser; use sa_compression::load_compressed_suffix_array; use sa_index::{ binary::load_suffix_array, - peptide_search::{ - analyse_all_peptides, - search_all_peptides, - OutputData, - SearchOnlyResult, - SearchResultWithAnalysis - }, - sa_searcher::Searcher, - suffix_to_protein_index::SparseSuffixToProtein, + peptide_search::{search_all_peptides, SearchResult}, + sa_searcher::SparseSearcher, SuffixArray }; -use sa_mappings::{ - 
functionality::FunctionAggregator, - proteins::Proteins, - taxonomy::{ - AggregationMethod, - TaxonAggregator - } -}; -use serde::{ - Deserialize, - Serialize -}; +use sa_mappings::proteins::Proteins; +use serde::Deserialize; /// Enum that represents all possible commandline arguments #[derive(Parser, Debug)] @@ -57,10 +30,7 @@ pub struct Arguments { #[arg(short, long)] database_file: String, #[arg(short, long)] - index_file: String, - #[arg(short, long)] - /// The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy. - taxonomy: String + index_file: String } /// Function used by serde to place a default value in the cutoff field of the input @@ -79,19 +49,16 @@ fn default_true() -> bool { /// # Arguments /// * `peptides` - List of peptides we want to process /// * `cutoff` - The maximum amount of matches to process, default value 10000 -/// * `equalize_I_and_L` - True if we want to equalize I and L during search +/// * `equate_il` - True if we want to equalize I and L during search /// * `clean_taxa` - True if we only want to use proteins marked as "valid" -#[derive(Debug, Deserialize, Serialize)] -#[allow(non_snake_case)] +#[derive(Debug, Deserialize)] struct InputData { - peptides: Vec, + peptides: Vec, #[serde(default = "default_cutoff")] // default value is 10000 cutoff: usize, #[serde(default = "bool::default")] // default value is false // TODO: maybe default should be true? - equalize_I_and_L: bool, - #[serde(default = "bool::default")] // default value is false - clean_taxa: bool + equate_il: bool } #[tokio::main] @@ -103,35 +70,6 @@ async fn main() { } } -/// Basic handler used to check the server status -async fn root() -> &'static str { - "Server is online" -} - -/// Endpoint executed for peptide matching and taxonomic and functional analysis -/// -/// # Arguments -/// * `state(searcher)` - The searcher object provided by the server -/// * `data` - InputData object provided by the user with the peptides to be searched and the config -/// -/// # Returns -/// -/// Returns the search and analysis results from the index as a JSON -async fn analyse( - State(searcher): State>, - data: Json -) -> Result>, StatusCode> { - let search_result = analyse_all_peptides( - &searcher, - &data.peptides, - data.cutoff, - data.equalize_I_and_L, - data.clean_taxa - ); - - Ok(Json(search_result)) -} - /// Endpoint executed for peptide matching, without any analysis /// /// # Arguments @@ -142,16 +80,10 @@ async fn analyse( /// /// Returns the search results from the index as a JSON async fn search( - State(searcher): State>, + State(searcher): State>, data: Json -) -> Result>, StatusCode> { - let search_result = search_all_peptides( - &searcher, - &data.peptides, - data.cutoff, - data.equalize_I_and_L, - data.clean_taxa - ); +) -> Result>, StatusCode> { + let search_result = search_all_peptides(&searcher, &data.peptides, data.cutoff, data.equate_il); Ok(Json(search_result)) } @@ -169,55 +101,25 @@ async fn search( /// /// Returns any error occurring during the startup or uptime of the server async fn start_server(args: Arguments) -> Result<(), Box> { - let Arguments { - database_file, - index_file, - taxonomy - } = args; + let Arguments { database_file, index_file } = args; eprintln!(); eprintln!("📋 Started loading the suffix array..."); - let sa = load_suffix_array_file(&index_file)?; + let suffix_array = load_suffix_array_file(&index_file)?; eprintln!("✅ Successfully loaded the suffix array!"); - eprintln!("\tAmount of items: {}", sa.len()); - eprintln!("\tAmount 
of bits per item: {}", sa.bits_per_value()); - eprintln!("\tSample rate: {}", sa.sample_rate()); - - eprintln!(); - eprintln!("📋 Started loading the taxon file..."); - let taxon_id_calculator = - TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar)?; - eprintln!("✅ Successfully loaded the taxon file!"); - eprintln!("\tAggregation method: LCA*"); - - eprintln!(); - eprintln!("📋 Started creating the function aggregator..."); - let function_aggregator = FunctionAggregator {}; - eprintln!("✅ Successfully created the function aggregator!"); + eprintln!("\tAmount of items: {}", suffix_array.len()); + eprintln!("\tAmount of bits per item: {}", suffix_array.bits_per_value()); + eprintln!("\tSample rate: {}", suffix_array.sample_rate()); eprintln!(); eprintln!("📋 Started loading the proteins..."); - let proteins = Proteins::try_from_database_file(&database_file, &taxon_id_calculator)?; - let suffix_index_to_protein = Box::new(SparseSuffixToProtein::new(&proteins.input_string)); + let proteins = Proteins::try_from_database_file(&database_file)?; eprintln!("✅ Successfully loaded the proteins!"); - let searcher = Arc::new(Searcher::new( - sa, - suffix_index_to_protein, - proteins, - taxon_id_calculator, - function_aggregator - )); + let searcher = Arc::new(SparseSearcher::new(suffix_array, proteins)); // build our application with a route let app = Router::new() - // `GET /` goes to `root` - .route("/", get(root)) - // `POST /analyse` goes to `analyse` and set max payload size to 5 MB - .route("/analyse", post(analyse)) - .layer(DefaultBodyLimit::max(5 * 10_usize.pow(6))) - .with_state(searcher.clone()) - // `POST /search` goes to `search` and set max payload size to 5 MB .route("/search", post(search)) .layer(DefaultBodyLimit::max(5 * 10_usize.pow(6))) .with_state(searcher);
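// Illustrative sketch, not part of the patch: with the `/` and `/analyse` routes removed,
// `POST /search` is the only endpoint left, and its JSON body deserializes into `InputData`.
// A minimal request payload is sketched below; `serde_json` is assumed to be available and
// the peptide strings are placeholders.
fn example_search_payload() -> serde_json::Value {
    serde_json::json!({
        "peptides": ["AALTER", "MLPGLALLLLAAWTARALEV"],
        // optional: defaults to 10000 when omitted
        "cutoff": 10000,
        // optional: defaults to false when omitted
        "equate_il": true
    })
}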