From 9bd80fe1ec49ad78c9f530fd104d2f34517774a5 Mon Sep 17 00:00:00 2001 From: Bram Devlaminck Date: Tue, 14 May 2024 10:57:26 +0200 Subject: [PATCH] move all suffix array code from personal repo to unipept repo --- .gitmodules | 3 + Cargo.lock | 720 +++++++++++++++++++++++- Cargo.toml | 8 +- libsais64-rs/Cargo.toml | 10 + libsais64-rs/builder.rs | 89 +++ libsais64-rs/libsais | 1 + libsais64-rs/libsais-wrapper.h | 4 + libsais64-rs/src/lib.rs | 37 ++ sa-builder/Cargo.toml | 4 + sa-builder/build_suffix_array.pbs | 41 ++ sa-builder/src/binary.rs | 134 +++++ sa-builder/src/lib.rs | 77 +++ sa-builder/src/main.rs | 39 +- sa-index/Cargo.toml | 7 + sa-index/src/lib.rs | 226 ++++++++ sa-index/src/main.rs | 9 +- sa-index/src/peptide_search.rs | 252 +++++++++ sa-index/src/sa_searcher.rs | 506 +++++++++++++++++ sa-index/src/suffix_to_protein_index.rs | 159 ++++++ sa-index/src/util.rs | 55 ++ sa-mappings/Cargo.toml | 3 + sa-mappings/src/functionality.rs | 104 ++-- sa-mappings/src/proteins.rs | 113 ++-- sa-mappings/src/taxonomy.rs | 59 +- sa-server/Cargo.toml | 7 + sa-server/src/main.rs | 185 +++++- 26 files changed, 2720 insertions(+), 132 deletions(-) create mode 100644 .gitmodules create mode 100644 libsais64-rs/Cargo.toml create mode 100644 libsais64-rs/builder.rs create mode 160000 libsais64-rs/libsais create mode 100644 libsais64-rs/libsais-wrapper.h create mode 100644 libsais64-rs/src/lib.rs create mode 100644 sa-builder/build_suffix_array.pbs create mode 100644 sa-builder/src/binary.rs create mode 100644 sa-builder/src/lib.rs create mode 100644 sa-index/src/lib.rs create mode 100644 sa-index/src/peptide_search.rs create mode 100644 sa-index/src/sa_searcher.rs create mode 100644 sa-index/src/suffix_to_protein_index.rs create mode 100644 sa-index/src/util.rs diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..2bd9ebc --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "libsais64-rs/libsais"] + path = libsais64-rs/libsais + url = https://github.com/IlyaGrebnov/libsais.git diff --git a/Cargo.lock b/Cargo.lock index 1c10d09..27c3eed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -41,12 +41,66 @@ dependencies = [ "winapi", ] +[[package]] +name = "anstream" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + [[package]] name = "anstyle" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +[[package]] +name = "anstyle-parse" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a64c907d4e79225ac72e2a354c9ce84d50ebb4586dee56c82b3ee73004f537f5" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + +[[package]] +name = "async-trait" +version = "0.1.80" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.58", +] + [[package]] name = "attohttpc" version = "0.15.0" @@ -54,7 +108,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe174d1b67f7b2bafed829c09db039301eb5841f66e43be2cf60b326e7f8e2cc" dependencies = [ "flate2", - "http", + "http 0.2.12", "log", "native-tls", "openssl", @@ -80,6 +134,74 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" +[[package]] +name = "axum" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf" +dependencies = [ + "async-trait", + "axum-core", + "axum-macros", + "bytes", + "futures-util", + "http 1.1.0", + "http-body", + "http-body-util", + "hyper", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper 1.0.1", + "tokio", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-core" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a15c63fd72d41492dc4f497196f5da1fb04fb7529e631d73630d1b491e47a2e3" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http 1.1.0", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper 0.1.2", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-macros" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00c055ee2d014ae5981ce1016374e8213682aa14d9bf40e48ab48b5f3ef20eaa" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn 2.0.58", +] + [[package]] name = "backtrace" version = "0.3.71" @@ -95,6 +217,29 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "bindgen" +version = "0.69.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" +dependencies = [ + "bitflags 2.5.0", + "cexpr", + "clang-sys", + "itertools", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.58", + "which", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -147,6 +292,15 @@ version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -180,6 +334,17 @@ dependencies = [ "half", ] +[[package]] +name = "clang-sys" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67523a3b4be3ce1989d607a828d036249522dd9c1c8de7f4dd2dae43a37369d1" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "2.34.0" @@ -189,7 +354,7 @@ dependencies = [ "ansi_term", "atty", "bitflags 1.3.2", - "strsim", + "strsim 0.8.0", "textwrap", 
"unicode-width", "vec_map", @@ -202,6 +367,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" dependencies = [ "clap_builder", + "clap_derive", ] [[package]] @@ -210,8 +376,22 @@ version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" dependencies = [ + "anstream", "anstyle", "clap_lex", + "strsim 0.11.1", +] + +[[package]] +name = "clap_derive" +version = "4.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.58", ] [[package]] @@ -220,6 +400,21 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] + +[[package]] +name = "colorchoice" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" + [[package]] name = "core-foundation" version = "0.9.4" @@ -346,7 +541,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -429,6 +624,15 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" +[[package]] +name = "futures-channel" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", +] + [[package]] name = "futures-core" version = "0.3.30" @@ -470,6 +674,12 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "half" version = "2.4.0" @@ -489,6 +699,18 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -504,6 +726,15 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name 
= "http" version = "0.2.12" @@ -515,6 +746,87 @@ dependencies = [ "itoa", ] +[[package]] +name = "http" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0475f8b2ac86659c21b64320d5d653f9efe42acd2a4e560073ec61a155a34f1d" +dependencies = [ + "bytes", + "futures-core", + "http 1.1.0", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe575dd17d0862a9a33781c8c4696a55c320909004a67a00fb286ba8b1bc496d" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http 1.1.0", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", +] + +[[package]] +name = "hyper-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body", + "hyper", + "pin-project-lite", + "socket2", + "tokio", +] + [[package]] name = "idna" version = "0.5.0" @@ -533,9 +845,15 @@ checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" dependencies = [ "hermit-abi 0.3.9", "libc", - "windows-sys", + "windows-sys 0.52.0", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" + [[package]] name = "itertools" version = "0.10.5" @@ -566,12 +884,44 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "libc" version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +[[package]] +name = "libdivsufsort-rs" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ae6d5de0472a582c6be43c392ba23e8ca59add069c8bd661f6b2a90690be3fb" +dependencies = [ + "cmake", +] + +[[package]] +name = "libloading" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" +dependencies = [ + "cfg-if", + "windows-targets 0.52.4", +] + 
+[[package]] +name = "libsais64-rs" +version = "0.1.0" +dependencies = [ + "bindgen", +] + [[package]] name = "linux-raw-sys" version = "0.4.13" @@ -584,6 +934,12 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + [[package]] name = "memchr" version = "2.7.2" @@ -600,6 +956,18 @@ dependencies = [ "winapi", ] +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.7.2" @@ -609,6 +977,17 @@ dependencies = [ "adler", ] +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.48.0", +] + [[package]] name = "native-tls" version = "0.2.11" @@ -627,6 +1006,16 @@ dependencies = [ "tempfile", ] +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "num-traits" version = "0.2.18" @@ -636,6 +1025,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi 0.3.9", + "libc", +] + [[package]] name = "object" version = "0.32.2" @@ -716,6 +1115,26 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "pin-project" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.58", +] + [[package]] name = "pin-project-lite" version = "0.2.14" @@ -768,6 +1187,16 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "prettyplease" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" +dependencies = [ + "proc-macro2", + "syn 2.0.58", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -941,6 +1370,12 @@ version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustix" version = "0.38.32" @@ -951,9 +1386,15 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.52.0", ] +[[package]] +name = "rustversion" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "092474d1a01ea8278f69e6a358998405fae5b8b963ddaeb2b0b04a128bf1dfb0" + [[package]] name = "ryu" version = "1.0.17" @@ -963,10 +1404,25 @@ checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" [[package]] name = "sa-builder" version = "0.1.0" +dependencies = [ + "clap 4.5.4", + "libdivsufsort-rs", + "libsais64-rs", + "sa-mappings", +] [[package]] name = "sa-index" version = "0.1.0" +dependencies = [ + "clap 4.5.4", + "rayon", + "sa-builder", + "sa-mappings", + "serde", + "serde_json", + "umgap", +] [[package]] name = "sa-mappings" @@ -974,10 +1430,25 @@ version = "0.1.0" dependencies = [ "bytelines", "fa-compression", + "serde", + "serde_json", "tempdir", "umgap", ] +[[package]] +name = "sa-server" +version = "0.1.0" +dependencies = [ + "axum", + "clap 4.5.4", + "sa-builder", + "sa-index", + "sa-mappings", + "serde", + "tokio", +] + [[package]] name = "same-file" version = "1.0.6" @@ -993,7 +1464,7 @@ version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -1041,21 +1512,71 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.115" +version = "1.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" +checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3" dependencies = [ "itoa", "ryu", "serde", ] +[[package]] +name = "serde_path_to_error" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af99884400da37c88f5e9146b7f1fd0fbcae8f6eec4e9da38b67d05486f814a6" +dependencies = [ + "itoa", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "socket2" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "strsim" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" +[[package]] +name = "strsim" +version = "0.11.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "structopt" version = "0.3.26" @@ -1073,7 +1594,7 @@ version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" dependencies = [ - "heck", + "heck 0.3.3", "proc-macro-error", "proc-macro2", "quote", @@ -1092,7 +1613,7 @@ version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e6e163a520367c465f59e0a61a23cfae3b10b6546d78b6f672a382be79f7110" dependencies = [ - "heck", + "heck 0.3.3", "proc-macro2", "quote", "syn 1.0.109", @@ -1120,6 +1641,18 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + +[[package]] +name = "sync_wrapper" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" + [[package]] name = "tempdir" version = "0.3.7" @@ -1139,7 +1672,7 @@ dependencies = [ "cfg-if", "fastrand", "rustix", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -1184,7 +1717,72 @@ checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" dependencies = [ "backtrace", "bytes", + "libc", + "mio", + "num_cpus", "pin-project-lite", + "socket2", + "tokio-macros", + "windows-sys 0.48.0", +] + +[[package]] +name = "tokio-macros" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.58", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "pin-project", + "pin-project-lite", + "tokio", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" + +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "log", + "pin-project-lite", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", ] [[package]] @@ -1252,6 +1850,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + [[package]] name = "vcpkg" version = "0.2.15" @@ -1350,6 +1954,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "which" +version 
= "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + [[package]] name = "winapi" version = "0.3.9" @@ -1381,13 +1997,37 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.4", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] @@ -1396,51 +2036,93 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_aarch64_msvc" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_gnu" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_i686_msvc" version = "0.52.4" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnu" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "windows_x86_64_msvc" version = "0.52.4" diff --git a/Cargo.toml b/Cargo.toml index 461b751..728b06e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,9 +1,11 @@ [workspace] resolver = "2" -members = [ +members = [ "fa-compression", + "libsais64-rs", "sa-builder", "sa-index", - "sa-mappings" -, "sa-server"] + "sa-mappings", + "sa-server" +] diff --git a/libsais64-rs/Cargo.toml b/libsais64-rs/Cargo.toml new file mode 100644 index 0000000..36799c3 --- /dev/null +++ b/libsais64-rs/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "libsais64-rs" +version = "0.1.0" +edition = "2021" +build = "builder.rs" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[build-dependencies] +bindgen = "0.69.4" diff --git a/libsais64-rs/builder.rs b/libsais64-rs/builder.rs new file mode 100644 index 0000000..d4e26b2 --- /dev/null +++ b/libsais64-rs/builder.rs @@ -0,0 +1,89 @@ +use std::env; +use std::error::Error; +use std::fmt::{Display, Formatter}; +use std::path::PathBuf; +use std::process::{Command, ExitStatus}; + +/// Custom error for compilation of the C library +#[derive(Debug)] +struct CompileError<'a> { + command: &'a str, + exit_code: Option, +} + +impl<'a> Display for CompileError<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let end_text = if let Some(code) = self.exit_code { + format!("with exit code {}", code) + } else { + "without exit code".to_string() + }; + let text = format!("Command with name `{}` failed {}", self.command, end_text); + write!(f, "{}", text) + } +} + +impl<'a> Error for CompileError<'a> {} + +/// Handles the exit statuses of the executed bash commands +/// +/// # Arguments +/// * `name` - Name of the executed bash command +/// * `exit_states` - The exit status of the executed bash command +/// +/// # Returns +/// +/// Returns () if the exit status was success +/// +/// # Errors +/// +/// Returns a CompilationError if the command failed +fn exit_status_to_result(name: &str, exit_status: ExitStatus) -> Result<(), CompileError> { + match exit_status.success() { + true => Ok(()), + false => Err(CompileError { + command: name, + exit_code: exit_status.code(), + }), + } +} + +fn main() -> Result<(), Box> { + // compile the c library + Command::new("rm") + .args(["libsais/CMakeCache.txt"]) + .status().unwrap_or_default(); // if removing 
fails, it is because the CMake cache did not exist, so we can safely ignore it + exit_status_to_result( + "cmake", + Command::new("cmake") + .args(["-DCMAKE_BUILD_TYPE=\"Release\"", "libsais", "-Blibsais"]) + .status()?, + )?; + exit_status_to_result( + "make", + Command::new("make").args(["-C", "libsais"]).status()?, + )?; + + // link the C libsais library to Rust + println!("cargo:rustc-link-search=native=libsais64-rs/libsais"); + println!("cargo:rustc-link-lib=static=libsais"); + + // The bindgen::Builder is the main entry point + // to bindgen, and lets you build up options for + // the resulting bindings. + let bindings = bindgen::Builder::default() + // The input header we would like to generate + // bindings for. + .header("libsais-wrapper.h") + // Tell cargo to invalidate the built crate whenever any of the + // included header files changed. + .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) + // Finish the builder and generate the bindings. + .generate()?; + + // Write the bindings to the $OUT_DIR/bindings.rs file. + let out_path = PathBuf::from(env::var("OUT_DIR")?); + bindings.write_to_file(out_path.join("bindings.rs"))?; + + Ok(()) +} diff --git a/libsais64-rs/libsais b/libsais64-rs/libsais new file mode 160000 index 0000000..2e5f9c2 --- /dev/null +++ b/libsais64-rs/libsais @@ -0,0 +1 @@ +Subproject commit 2e5f9c22109847f42f8aeb7d4865e7ffa5dd0e11 diff --git a/libsais64-rs/libsais-wrapper.h b/libsais64-rs/libsais-wrapper.h new file mode 100644 index 0000000..fbfe0b9 --- /dev/null +++ b/libsais64-rs/libsais-wrapper.h @@ -0,0 +1,4 @@ +#include "libsais/include/libsais64.h" + + +int64_t libsais64(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); \ No newline at end of file diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs new file mode 100644 index 0000000..59b1959 --- /dev/null +++ b/libsais64-rs/src/lib.rs @@ -0,0 +1,37 @@ +// allow the non-Rust naming style used by the C code and import the generated C bindings +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + + +/// Builds the suffix array over the `text` using the libsais64 algorithm +/// +/// # Arguments +/// * `text` - The text used for suffix array construction +/// +/// # Returns +/// +/// Returns Some containing the suffix array built over the text if construction succeeds +/// Returns None if construction of the suffix array failed +pub fn sais64(text: &[u8]) -> Option<Vec<i64>> { + let mut sa = vec![0; text.len()]; + let exit_code = unsafe { libsais64(text.as_ptr(), sa.as_mut_ptr(), text.len() as i64, 0, std::ptr::null_mut()) }; + if exit_code == 0 { + Some(sa) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use crate::{sais64}; + + #[test] + fn check_build_sa_with_libsais64() { + let text = "banana$"; + let sa = sais64(text.as_bytes()); + assert_eq!(sa, Some(vec![6, 5, 3, 1, 0, 4, 2])); + } +} diff --git a/sa-builder/Cargo.toml b/sa-builder/Cargo.toml index 31d44b6..9937384 100644 --- a/sa-builder/Cargo.toml +++ b/sa-builder/Cargo.toml @@ -6,3 +6,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +clap = { version = "4.4.8", features = ["derive"] } +libsais64-rs = { path = "../libsais64-rs" } +libdivsufsort-rs = "0.1.0" +sa-mappings = { path = "../sa-mappings" } diff --git a/sa-builder/build_suffix_array.pbs b/sa-builder/build_suffix_array.pbs new file mode 100644 index 0000000..fac37e6 --- /dev/null +++ 
b/sa-builder/build_suffix_array.pbs @@ -0,0 +1,41 @@ +#!/bin/bash + +######################################################################################################### +### This script is designed to run on the Ghent University HPC ### +### ### +### how to use: ### +### 1) Swap to the high-memory gallade cluster by executing `module swap cluster/gallade` ### +### 2) Navigate to the root of the project ### +### 3) Submit the job to the queue with `qsub sa-builder/build_suffix_array.pbs` ### +######################################################################################################### + +# go to cluster with high memory +module swap cluster/gallade + +# define requested memory, cpu resources and email notifications +#PBS -m abe +#PBS -l walltime=10:00:00 +#PBS -l mem=750gb +# ask for 1 node, 1 cpu (not more needed since we don't use parallelism) +#PBS -l nodes=1:ppn=1 +#PBS -N suffix_array_construction_uniprot + +# define output and error files +#PBS -o stdout.$PBS_JOBID +#PBS -e stderr.$PBS_JOBID + +prefix="$VSC_DATA_VO/bram/" + +# load Rust +module load Rust/1.75.0-GCCcore-12.3.0 +module load Clang/16.0.6-GCCcore-12.3.0 # needed to build the bindings from Rust to C +module load CMake/3.26.3-GCCcore-12.3.0 + +# go to current working dir and execute +cd $PBS_O_WORKDIR + +# compile +cargo build --release + +# execute +./target/release/sa-builder -d "$prefix"uniprot_protein_database_minimal.tsv -t "$prefix"taxons.tsv --sparseness-factor 3 --construction-algorithm lib-div-suf-sort -o "$prefix"uniprot_suffix_array_sparse3.bin diff --git a/sa-builder/src/binary.rs b/sa-builder/src/binary.rs new file mode 100644 index 0000000..78e7603 --- /dev/null +++ b/sa-builder/src/binary.rs @@ -0,0 +1,134 @@ +use std::cmp::min; +use std::error::Error; +use std::fs::{File, OpenOptions}; +use std::io::{Read, Write}; + +const ONE_GIB: usize = 2usize.pow(30); + +/// Trait implemented by structs that are binary serializable +/// In our case this will be [i64], since the suffix array is a Vec<i64> +pub trait Serializable { + + /// Serializes self into a vector of bytes + /// + /// # Returns + /// + /// Returns a vector of bytes + fn serialize(&self) -> Vec<u8>; +} + +impl Serializable for [i64] { + fn serialize(&self) -> Vec<u8> { + let mut res = vec![]; + self.iter().for_each(|entry| + res.extend_from_slice(&entry.to_le_bytes()) + ); + res + } +} + +/// Deserializes a vector of bytes into the suffix array +/// +/// # Arguments +/// * `data` - The raw bytes that need to be deserialized into a suffix array +/// +/// # Returns +/// +/// Returns the suffix array, a Vec<i64> +fn deserialize_sa(data: &[u8]) -> Vec<i64> { + let mut res = vec![]; + if data.len() % 8 != 0 { + panic!("Serialized data is not a multiple of 8 bytes long!") + } + for start in (0..data.len()).step_by(8) { + res.push(i64::from_le_bytes(data[start..start + 8].try_into().unwrap())); + } + res +} + +/// Writes the given suffix array together with its `sparseness_factor` to the given file +/// +/// # Arguments +/// * `sparseness_factor` - The sparseness factor of the suffix array +/// * `suffix_array` - The suffix array +/// * `filename` - The name of the file we want to write the suffix array to +/// +/// # Returns +/// +/// Returns () if writing away the suffix array succeeded +/// +/// # Errors +/// +/// Returns an io::Error if writing away the suffix array failed +pub fn write_suffix_array(sparseness_factor: u8, suffix_array: &[i64], filename: &str) -> Result<(), std::io::Error> { + // create the file + let mut f = OpenOptions::new() + 
.create(true) + .write(true) + .truncate(true) // if the file already exists, empty the file + .open(filename)?; + f.write_all(&[sparseness_factor])?; // write the sample rate as the first byte + + // write 1 GiB at a time, to minimize extra used memory since we need to translate i64 to [u8; 8] + let sa_len = suffix_array.len(); + for start_index in (0..sa_len).step_by(ONE_GIB/8) { + let end_index = min(start_index + ONE_GIB/8, sa_len); + f.write_all(&suffix_array[start_index..end_index].serialize())?; + } + + Ok(()) +} + +/// Loads the suffix array from the file with the given `filename` +/// +/// # Arguments +/// * `filename` - The filename of the file where the suffix array is stored +/// +/// # Returns +/// +/// Returns the sample rate of the suffix array, together with the suffix array +/// +/// # Errors +/// +/// Returns any error from opening the file or reading the file +pub fn load_suffix_array(filename: &str) -> Result<(u8, Vec), Box> { + let mut file = &File::open(filename)?; + let mut sparseness_factor_buffer = [0_u8; 1]; + file.read_exact(&mut sparseness_factor_buffer).map_err(|_| "Could not read the sample rate from the binary file")?; + let sparseness_factor = sparseness_factor_buffer[0]; + + let mut sa = vec![]; + loop { + let mut buffer = vec![]; + // use take in combination with read_to_end to ensure that the buffer will be completely filled (except when the file is smaller than the buffer) + let count = file.take(ONE_GIB as u64).read_to_end(&mut buffer)?; + if count == 0 { + break; + } + sa.extend_from_slice(&deserialize_sa(&buffer[..count])); + } + + Ok((sparseness_factor, sa)) +} + + +#[cfg(test)] +mod tests { + use crate::binary::{deserialize_sa, Serializable}; + + #[test] + fn test_serialize_deserialize() { + let data: Vec = vec![5, 2165487362, -12315135]; + let serialized = data.serialize(); + let deserialized = deserialize_sa(serialized.as_ref()); + assert_eq!(data, deserialized); + } + + #[test] + fn test_serialize_deserialize_empty() { + let data: Vec = vec![]; + let serialized = data.serialize(); + let deserialized = deserialize_sa(serialized.as_ref()); + assert_eq!(data, deserialized); + } +} \ No newline at end of file diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs new file mode 100644 index 0000000..d009656 --- /dev/null +++ b/sa-builder/src/lib.rs @@ -0,0 +1,77 @@ +pub mod binary; + +use std::error::Error; +use clap::{Parser, ValueEnum}; + +/// Enum that represents all possible commandline arguments +#[derive(Parser, Debug)] +pub struct Arguments { + /// File with the proteins used to build the suffix tree. All the proteins are expected to be concatenated using a `#`. + #[arg(short, long)] + pub database_file: String, + #[arg(short, long)] + /// The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy. + pub taxonomy: String, + /// Output file to store the built index. 
+ #[arg(short, long)] + pub output: String, + /// The sparseness factor used on the suffix array (default value 1, which means every value in the SA is used) + #[arg(long, default_value_t = 1)] + pub sparseness_factor: u8, + #[arg(short, long, value_enum, default_value_t = SAConstructionAlgorithm::LibSais)] + pub construction_algorithm: SAConstructionAlgorithm, +} + +/// Enum representing the two possible algorithms to construct the suffix array +#[derive(ValueEnum, Clone, Debug, PartialEq)] +pub enum SAConstructionAlgorithm { + LibDivSufSort, + LibSais, +} + +/// Builds the suffix array over the given text using the selected construction algorithm +/// +/// # Arguments +/// * `data` - The text on which we want to build the suffix array +/// * `construction_algorithm` - The algorithm used during construction +/// * `sparseness_factor` - The sparseness factor used on the suffix array +/// +/// # Returns +/// +/// Returns the constructed suffix array +/// +/// # Errors +/// +/// Returns any error that occurred during the construction of the suffix array itself +pub fn build_sa(data: &mut Vec<u8>, construction_algorithm: &SAConstructionAlgorithm, sparseness_factor: u8) -> Result<Vec<i64>, Box<dyn Error>> { + + // translate all L's to an I + for character in data.iter_mut() { + if *character == b'L' { + *character = b'I' + } + } + + let mut sa = match construction_algorithm { + SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(data), + SAConstructionAlgorithm::LibDivSufSort => { + libdivsufsort_rs::divsufsort64(data) + } + }.ok_or("Building suffix array failed")?; + + // make the SA sparse and decrease the vector size if we use sampling (i.e. sparseness_factor > 1) + if sparseness_factor > 1 { + let mut current_sampled_index = 0; + for i in 0..sa.len() { + let current_sa_val = sa[i]; + if current_sa_val % sparseness_factor as i64 == 0 { + sa[current_sampled_index] = current_sa_val; + current_sampled_index += 1; + } + } + // shrink the vector to the number of sampled entries + sa.resize(current_sampled_index, 0); + } + + Ok(sa) +} \ No newline at end of file diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index e7a11a9..d738540 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -1,3 +1,38 @@ +use clap::Parser; +use sa_mappings::proteins::Proteins; +use sa_mappings::taxonomy::{AggregationMethod, TaxonAggregator}; +use sa_builder::{Arguments, build_sa}; +use sa_builder::binary::write_suffix_array; + fn main() { - println!("Hello, world!"); -} + let args = Arguments::parse(); + let Arguments { database_file, taxonomy, output, sparseness_factor, construction_algorithm } = args; + let taxon_id_calculator = TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar); + if let Err(err) = taxon_id_calculator { + eprintln!("{}", err); + std::process::exit(1); + } + + let taxon_id_calculator = taxon_id_calculator.unwrap(); + + // read input + let data = Proteins::try_from_database_file_without_annotations(&database_file, &taxon_id_calculator); + if let Err(err) = data { + eprintln!("{}", err); + std::process::exit(1); + } + let mut data = data.unwrap(); + // calculate the suffix array + let sa = build_sa(&mut data, &construction_algorithm, sparseness_factor); + if let Err(err) = sa { + eprintln!("{}", err); + std::process::exit(1); + } + let sa = sa.unwrap(); + + // output the built SA + if let Err(err) = write_suffix_array(sparseness_factor, &sa, &output) { + eprintln!("{}", err); + std::process::exit(1); + }; +} \ No newline at end of file diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml index 3470891..c355bef 100644 --- a/sa-index/Cargo.toml +++ b/sa-index/Cargo.toml @@ -6,3 +6,10 @@
edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +clap = { version = "4.4.8", features = ["derive"] } +umgap = "1.1.0" +rayon = "1.8.1" +serde = { version = "1.0.197", features = ["derive"] } +sa-builder = { path = "../sa-builder" } +sa-mappings = { path = "../sa-mappings" } +serde_json = "1.0.116" diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs new file mode 100644 index 0000000..90753a6 --- /dev/null +++ b/sa-index/src/lib.rs @@ -0,0 +1,226 @@ +use std::error::Error; +use std::num::NonZeroUsize; + +use clap::{arg, Parser, ValueEnum}; + +use sa_mappings::functionality::FunctionAggregator; +use sa_mappings::proteins::Proteins; +use sa_mappings::taxonomy::{AggregationMethod, TaxonAggregator}; +use sa_builder::{build_sa, SAConstructionAlgorithm}; +use sa_builder::binary::{load_suffix_array, write_suffix_array}; + +use crate::peptide_search::{analyse_all_peptides, search_all_peptides}; +use crate::sa_searcher::Searcher; +use crate::suffix_to_protein_index::{ + DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex, SuffixToProteinMappingStyle, +}; +use crate::util::{get_time_ms, read_lines}; + +pub mod peptide_search; +pub mod sa_searcher; +pub mod suffix_to_protein_index; +pub mod util; + +/// Enum that represents the 2 kinds of search that are supported +#[derive(ValueEnum, Clone, Debug, PartialEq)] +pub enum SearchMode { + Search, + Analysis, +} + +/// Enum that represents all possible commandline arguments +#[derive(Parser, Debug)] +pub struct Arguments { + /// File with the proteins used to build the suffix tree. All the proteins are expected to be concatenated using a `#`. + #[arg(short, long)] + database_file: String, + #[arg(short, long)] + search_file: Option, + #[arg(short, long)] + /// The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy. + taxonomy: String, + /// This will only build the tree and stop after that is completed. Used during benchmarking. + #[arg(long)] + build_only: bool, + /// Output file to store the built index. + #[arg(short, long)] + output: Option, + /// The sparseness factor used on the suffix array (default value 1, which means every value in the SA is used) + #[arg(long, default_value_t = 1)] + sparseness_factor: u8, + /// Set the style used to map back from the suffix to the protein. 2 options or . 
Dense is default + /// Dense uses O(n) memory with n the size of the input text, and takes O(1) time to find the mapping + /// Sparse uses O(m) memory with m the number of proteins, and takes O(log m) to find the mapping + #[arg(long, value_enum, default_value_t = SuffixToProteinMappingStyle::Sparse)] + suffix_to_protein_mapping: SuffixToProteinMappingStyle, + #[arg(long)] + load_index: Option, + #[arg(short, long, value_enum, default_value_t = SAConstructionAlgorithm::LibSais)] + construction_algorithm: SAConstructionAlgorithm, + /// Assume the resulting taxon ID is root (1) whenever a peptide matches >= cutoff proteins + #[arg(long, default_value_t = 10000)] + cutoff: usize, + #[arg(long)] + threads: Option, + #[arg(long)] + equalize_i_and_l: bool, + #[arg(long)] + clean_taxa: bool, + #[arg(long, value_enum, default_value_t = SearchMode::Analysis)] + search_mode: SearchMode +} + + +/// Run the suffix array program +/// +/// # Arguments +/// * `args` - The commandline arguments provided to the program +/// +/// # Returns +/// +/// Unit +/// +/// # Errors +/// +/// Returns all possible errors that occurred during the program +pub fn run(mut args: Arguments) -> Result<(), Box> { + let taxon_id_calculator = + TaxonAggregator::try_from_taxonomy_file(&args.taxonomy, AggregationMethod::LcaStar)?; + + let sa = match &args.load_index { + // load SA from file + Some(index_file_name) => { + let (sparseness_factor, sa) = load_suffix_array(index_file_name)?; + args.sparseness_factor = sparseness_factor; + // println!("Loading the SA took {} ms and loading the proteins + SA took {} ms", end_loading_ms - start_loading_ms, end_loading_ms - start_reading_proteins_ms); + // TODO: some kind of security check that the loaded database file and SA match + sa + } + // build the SA + None => { + let protein_sequences = + Proteins::try_from_database_file(&args.database_file, &taxon_id_calculator)?; + build_sa( + &mut protein_sequences.input_string.clone(), + &args.construction_algorithm, + args.sparseness_factor, + )? 
+ } + }; + + let proteins = Proteins::try_from_database_file(&args.database_file, &taxon_id_calculator)?; + + if let Some(output) = &args.output { + write_suffix_array(args.sparseness_factor, &sa, output)?; + } + + // option that only builds the tree, but does not allow for querying (easy for benchmark purposes) + if args.build_only { + return Ok(()); + } + + // build the right mapping index, use box to be able to store both types in this variable + let suffix_index_to_protein: Box = + match args.suffix_to_protein_mapping { + SuffixToProteinMappingStyle::Dense => { + Box::new(DenseSuffixToProtein::new(&proteins.input_string)) + } + SuffixToProteinMappingStyle::Sparse => { + Box::new(SparseSuffixToProtein::new(&proteins.input_string)) + } + }; + + let functional_aggregator = FunctionAggregator {}; + + let searcher = Searcher::new( + sa, + args.sparseness_factor, + suffix_index_to_protein, + proteins, + taxon_id_calculator, + functional_aggregator, + ); + + execute_search(&searcher, &args)?; + Ok(()) +} + +/// Execute the search using the provided programs +/// +/// # Arguments +/// * `searcher` - The Searcher which contains the protein database +/// * `args` - The arguments used to start the program +/// +/// # Returns +/// +/// Unit +/// +/// # Errors +/// +/// Returns possible errors that occurred during search +fn execute_search(searcher: &Searcher, args: &Arguments) -> Result<(), Box> { + let cutoff = args.cutoff; + let search_file = args + .search_file + .as_ref() + .ok_or("No peptide file provided to search in the database")?; + + let start_time = get_time_ms()?; + let lines = read_lines(search_file)?; + let all_peptides: Vec = lines.map_while(Result::ok).collect(); + + // Explicitly set the number of threads to use if the commandline argument was set + if let Some(threads) = args.threads { + rayon::ThreadPoolBuilder::new() + .num_threads(threads.get()) + .build_global()?; + } + + match args.search_mode { + SearchMode::Search => { + let search_result = search_all_peptides( + searcher, + &all_peptides, + cutoff, + args.equalize_i_and_l, + args.clean_taxa, + ); + println!("{}", serde_json::to_string(&search_result)?); + } + SearchMode::Analysis => { + let search_result = analyse_all_peptides( + searcher, + &all_peptides, + cutoff, + args.equalize_i_and_l, + args.clean_taxa, + ); + println!("{}", serde_json::to_string(&search_result)?); + } + } + + let end_time = get_time_ms()?; + + // output to other channel to prevent integrating it into the actual output + eprintln!( + "Spend {} ms to search the whole file", + end_time - start_time + ); + + Ok(()) +} + +/// Custom trait implemented by types that have a value that represents NULL +pub trait Nullable { + const NULL: T; + + fn is_null(&self) -> bool; +} + +impl Nullable for u32 { + const NULL: u32 = u32::MAX; + + fn is_null(&self) -> bool { + *self == Self::NULL + } +} diff --git a/sa-index/src/main.rs b/sa-index/src/main.rs index e7a11a9..cd56323 100644 --- a/sa-index/src/main.rs +++ b/sa-index/src/main.rs @@ -1,3 +1,10 @@ +use clap::Parser; +use sa_index::{Arguments, run}; + fn main() { - println!("Hello, world!"); + let args = Arguments::parse(); + if let Err(error) = run(args) { + eprintln!("{}", error); + std::process::exit(1); + }; } diff --git a/sa-index/src/peptide_search.rs b/sa-index/src/peptide_search.rs new file mode 100644 index 0000000..d1277c0 --- /dev/null +++ b/sa-index/src/peptide_search.rs @@ -0,0 +1,252 @@ +use crate::sa_searcher::{SearchAllSuffixesResult, Searcher}; +use rayon::prelude::*; +use 
sa_mappings::functionality::FunctionalAggregation; +use sa_mappings::proteins::Protein; +use serde::Serialize; + +/// Struct representing a collection of `SearchResultWithAnalysis` or `SearchOnlyResult` results +#[derive(Debug, Serialize)] +pub struct OutputData { + result: Vec, +} + +/// Struct representing the search result of the `sequence` in the index, including the analyses +#[derive(Debug, Serialize)] +pub struct SearchResultWithAnalysis { + sequence: String, + lca: Option, + taxa: Vec, + uniprot_accession_numbers: Vec, + fa: Option, + cutoff_used: bool, +} + +/// Struct representing the search result of the `sequence` in the index (without the analyses) +#[derive(Debug, Serialize)] +pub struct SearchOnlyResult { + sequence: String, + proteins: Vec, + cutoff_used: bool, +} + +/// Struct that represents all information known about a certain protein in our database +#[derive(Debug, Serialize)] +pub struct ProteinInfo { + taxon: usize, + uniprot_accession: String, + functional_annotations: Vec, +} + +/// Searches the `peptide` in the index multithreaded and retrieves the matching proteins +/// +/// # Arguments +/// * `searcher` - The Searcher which contains the protein database +/// * `peptide` - The peptide that is being searched in the index +/// * `cutoff` - The maximum amount of matches we want to process from the index +/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search +/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the taxonomy +/// +/// # Returns +/// +/// Returns Some if matches are found. +/// The first argument is true if the cutoff is used, otherwise false +/// The second argument is a list of all matching proteins for the peptide +/// Returns None if the peptides does not have any matches, or if the peptide is shorter than the sparseness factor k used in the index +pub fn search_proteins_for_peptide<'a>( + searcher: &'a Searcher, + peptide: &str, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, +) -> Option<(bool, Vec<&'a Protein>)> { + let peptide = peptide.strip_suffix('\n').unwrap_or(peptide).to_uppercase(); + + // words that are shorter than the sample rate are not searchable + if peptide.len() < searcher.sparseness_factor as usize { + return None; + } + + let suffix_search = + searcher.search_matching_suffixes(peptide.as_bytes(), cutoff, equalize_i_and_l); + let mut cutoff_used = false; + let suffixes = match suffix_search { + SearchAllSuffixesResult::MaxMatches(matched_suffixes) => { + cutoff_used = true; + matched_suffixes + } + SearchAllSuffixesResult::SearchResult(matched_suffixes) => matched_suffixes, + SearchAllSuffixesResult::NoMatches => { + return None; + } + }; + + let mut proteins = searcher.retrieve_proteins(&suffixes); + if clean_taxa { + proteins.retain(|protein| searcher.taxon_valid(protein)) + } + + Some((cutoff_used, proteins)) +} + + +/// Searches the `peptide` in the index multithreaded and retrieves the protein information from the database +/// This does NOT perform any of the analyses, it only retrieves the functional and taxonomic annotations +/// +/// # Arguments +/// * `searcher` - The Searcher which contains the protein database +/// * `peptide` - The peptide that is being searched in the index +/// * `cutoff` - The maximum amount of matches we want to process from the index +/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search +/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are 
invalid in the taxonomy +/// +/// # Returns +/// +/// Returns Some(SearchOnlyResult) if the peptide has matches +/// Returns None if the peptides does not have any matches, or if the peptide is shorter than the sparseness factor k used in the index +pub fn search_peptide_retrieve_annotations( + searcher: &Searcher, + peptide: &str, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, +) -> Option { + let (cutoff_used, proteins) = + search_proteins_for_peptide(searcher, peptide, cutoff, equalize_i_and_l, clean_taxa)?; + + let annotations = searcher.get_all_functional_annotations(&proteins); + + let mut protein_info: Vec = vec![]; + for (&protein, annotations) in proteins.iter().zip(annotations) { + protein_info.push(ProteinInfo { + taxon: protein.taxon_id, + uniprot_accession: protein.uniprot_id.clone(), + functional_annotations: annotations, + }) + } + + Some(SearchOnlyResult { + sequence: peptide.to_string(), + proteins: protein_info, + cutoff_used, + }) +} + + +/// Searches the `peptide` in the index multithreaded and performs the taxonomic and functional analyses +/// +/// # Arguments +/// * `searcher` - The Searcher which contains the protein database +/// * `peptide` - The peptide that is being searched in the index +/// * `cutoff` - The maximum amount of matches we want to process from the index +/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search +/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the taxonomy +/// +/// # Returns +/// +/// Returns Some(SearchResultWithAnalysis) if the peptide has matches +/// Returns None if the peptides does not have any matches, or if the peptide is shorter than the sparseness factor k used in the index +pub fn analyse_peptide( + searcher: &Searcher, + peptide: &str, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, +) -> Option { + let (cutoff_used, mut proteins) = + search_proteins_for_peptide(searcher, peptide, cutoff, equalize_i_and_l, clean_taxa)?; + + if clean_taxa { + proteins.retain(|protein| searcher.taxon_valid(protein)) + } + + // calculate the lca + let lca = if cutoff_used { + Some(1) + } else { + searcher.retrieve_lca(&proteins) + }; + + // return None if the LCA is none + lca?; + + let mut uniprot_accession_numbers = vec![]; + let mut taxa = vec![]; + + for protein in &proteins { + taxa.push(protein.taxon_id); + uniprot_accession_numbers.push(protein.uniprot_id.clone()); + } + + let fa = searcher.retrieve_function(&proteins); + // output the result + Some(SearchResultWithAnalysis { + sequence: peptide.to_string(), + lca, + cutoff_used, + uniprot_accession_numbers, + taxa, + fa, + }) +} + + +/// Searches the list of `peptides` in the index multithreaded and performs the functional and taxonomic analyses +/// +/// # Arguments +/// * `searcher` - The Searcher which contains the protein database +/// * `peptides` - List of peptides we want to search in the index +/// * `cutoff` - The maximum amount of matches we want to process from the index +/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search +/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the taxonomy +/// +/// # Returns +/// +/// Returns an `OutputData` object with the search and analyses results for the peptides +pub fn analyse_all_peptides( + searcher: &Searcher, + peptides: &Vec, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, +) -> OutputData { + let res: Vec = peptides + 
.par_iter() + // calculate the results + .map(|peptide| analyse_peptide(searcher, peptide, cutoff, equalize_i_and_l, clean_taxa)) + // remove the None's + .filter_map(|search_result| search_result) + .collect(); + + OutputData { result: res } +} + +/// Searches the list of `peptides` in the index and retrieves all related information about the found proteins +/// This does NOT perform any of the analyses +/// +/// # Arguments +/// * `searcher` - The Searcher which contains the protein database +/// * `peptides` - List of peptides we want to search in the index +/// * `cutoff` - The maximum amount of matches we want to process from the index +/// * `equalize_i_and_l` - Boolean indicating if we want to equate I and L during search +/// * `clean_taxa` - Boolean indicating if we want to filter out proteins that are invalid in the taxonomy +/// +/// # Returns +/// +/// Returns an `OutputData` object with the search results for the peptides +pub fn search_all_peptides( + searcher: &Searcher, + peptides: &Vec, + cutoff: usize, + equalize_i_and_l: bool, + clean_taxa: bool, +) -> OutputData { + let res: Vec = peptides + .par_iter() + // calculate the results + .map(|peptide| search_peptide_retrieve_annotations(searcher, peptide, cutoff, equalize_i_and_l, clean_taxa)) + // remove None's + .filter_map(|search_result| search_result) + .collect(); + + OutputData { result: res } +} diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs new file mode 100644 index 0000000..6e7b44b --- /dev/null +++ b/sa-index/src/sa_searcher.rs @@ -0,0 +1,506 @@ +use std::cmp::min; + + +use sa_mappings::functionality::{FunctionAggregator, FunctionalAggregation}; +use sa_mappings::proteins::{Protein, Proteins}; +use sa_mappings::taxonomy::TaxonAggregator; +use umgap::taxon::TaxonId; + +use crate::sa_searcher::BoundSearch::{Maximum, Minimum}; +use crate::suffix_to_protein_index::SuffixToProteinIndex; +use crate::Nullable; + +/// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array +#[derive(Clone, Copy, PartialEq)] +enum BoundSearch { + Minimum, + Maximum, +} + +/// Enum representing the minimum and maximum bound of the found matches in the suffix array +#[derive(PartialEq, Debug)] +pub enum BoundSearchResult { + NoMatches, + SearchResult((usize, usize)), +} + +/// Enum representing the matching suffixes after searching a peptide in the suffix array +/// Both the MaxMatches and SearchResult indicate found suffixes, but MaxMatches is used when the cutoff is reached. 
+#[derive(Debug)] +pub enum SearchAllSuffixesResult { + NoMatches, + MaxMatches(Vec), + SearchResult(Vec), +} + +/// Custom implementation of partialEq for SearchAllSuffixesResult +/// We consider 2 SearchAllSuffixesResult equal if they exist of the same key, and the Vec contains the same values, but the order can be different +impl PartialEq for SearchAllSuffixesResult { + fn eq(&self, other: &Self) -> bool { + + /// Returns true if `arr1` and `arr2` contains the same elements, the order of the elements is ignored + /// + /// # Arguments + /// * `arr1` - The first array used in the comparison + /// * `arr2` - The second array used in the comparison + /// + /// # Returns + /// + /// Returns true if arr1 and arr2 contains the same elements, the order of the elements is ignored + fn array_eq_unordered(arr1: &[i64], arr2: &[i64]) -> bool { + let mut arr1_copy = arr1.to_owned(); + let mut arr2_copy = arr2.to_owned(); + + arr1_copy.sort(); + arr2_copy.sort(); + + arr1_copy == arr2_copy + } + + match (self, other) { + ( + SearchAllSuffixesResult::MaxMatches(arr1), + SearchAllSuffixesResult::MaxMatches(arr2), + ) => array_eq_unordered(arr1, arr2), + ( + SearchAllSuffixesResult::SearchResult(arr1), + SearchAllSuffixesResult::SearchResult(arr2), + ) => array_eq_unordered(arr1, arr2), + (SearchAllSuffixesResult::NoMatches, SearchAllSuffixesResult::NoMatches) => true, + _ => false, + } + } +} + +/// Struct that contains all the elements needed to search a peptide in the suffix array +/// This struct also contains all the functions used for search +/// +/// # Arguments +/// * `sa` - The sparse suffix array representing the protein database +/// * `sparseness_factor` - The sparseness factor used by the suffix array +/// * `suffix_index_to_protein` - Mapping from a suffix to the proteins to know which a suffix is part of +/// * `taxon_id_calculator` - Object representing the used taxonomy and that calculates the taxonomic analysis provided by Unipept +/// * `function_aggregator` - Object used to retrieve the functional annotations and to calculate the functional analysis provided by Unipept +pub struct Searcher { + sa: Vec, + pub sparseness_factor: u8, + suffix_index_to_protein: Box, + proteins: Proteins, + taxon_id_calculator: TaxonAggregator, + function_aggregator: FunctionAggregator +} + +impl Searcher { + + /// Creates a new Searcher object + /// + /// # Arguments + /// * `sa` - The sparse suffix array representing the protein database + /// * `sparseness_factor` - The sparseness factor used by the suffix array + /// * `suffix_index_to_protein` - Mapping from a suffix to the proteins to know which a suffix is part of + /// * `proteins` - List of all the proteins where the suffix array is build on + /// * `taxon_id_calculator` - Object representing the used taxonomy and that calculates the taxonomic analysis provided by Unipept + /// * `function_aggregator` - Object used to retrieve the functional annotations and to calculate the functional analysis provided by Unipept + /// + /// # Returns + /// + /// Returns a new Searcher object + pub fn new( + sa: Vec, + sparseness_factor: u8, + suffix_index_to_protein: Box, + proteins: Proteins, + taxon_id_calculator: TaxonAggregator, + function_aggregator: FunctionAggregator + ) -> Self { + Self { + sa, + sparseness_factor, + suffix_index_to_protein, + proteins, + taxon_id_calculator, + function_aggregator + } + } + + /// Compares the `search_string` to the `suffix` + /// During search this function performs extra logic since the suffix array is build with I == 
L, while ` self.proteins.input_string` is the original text where I != L + /// + /// # Arguments + /// * `search_string` - The string/peptide being searched in the suffix array + /// * `suffix` - The current suffix from the suffix array we are comparing with in the binary search + /// * `skip` - How many characters we can skip in the comparison because we already know these match + /// * `bound` - Indicates if we are searching for the min of max bound + /// + /// # Returns + /// + /// The first argument is true if `bound` == `Minimum` and `search_string` <= `suffix` or if `bound` == `Maximum` and `search_string` >= `suffix` + /// The second argument indicates how far the `suffix` and `search_string` matched + fn compare( + &self, + search_string: &[u8], + suffix: i64, + skip: usize, + bound: BoundSearch, + ) -> (bool, usize) { + let mut index_in_suffix = (suffix as usize) + skip; + let mut index_in_search_string = skip; + let mut is_cond_or_equal = false; + + // Depending on if we are searching for the min of max bound our condition is different + let condition_check = match bound { + Minimum => |a: u8, b: u8| a < b, + Maximum => |a: u8, b: u8| a > b, + }; + + // match as long as possible + while index_in_search_string < search_string.len() + && index_in_suffix < self.proteins.input_string.len() + && (search_string[index_in_search_string] + == self.proteins.input_string[index_in_suffix] + || (search_string[index_in_search_string] == b'L' + && self.proteins.input_string[index_in_suffix] == b'I') + || (search_string[index_in_search_string] == b'I' + && self.proteins.input_string[index_in_suffix] == b'L')) + { + index_in_suffix += 1; + index_in_search_string += 1; + } + // check if match found OR current search string is smaller lexicographically (and the empty search string should not be found) + if !search_string.is_empty() { + if index_in_search_string == search_string.len() { + is_cond_or_equal = true + } else if index_in_suffix < self.proteins.input_string.len() { + // in our index every L was replaced by a I, so we need to replace them if we want to search in the right direction + let peptide_char = if search_string[index_in_search_string] == b'L' { + b'I' + } else { + search_string[index_in_search_string] + }; + + let protein_char = if self.proteins.input_string[index_in_suffix] == b'L' { + b'I' + } else { + self.proteins.input_string[index_in_suffix] + }; + + is_cond_or_equal = condition_check(peptide_char, protein_char); + } + } + + (is_cond_or_equal, index_in_search_string) + } + + /// Searches for the minimum or maximum bound for a string in the suffix array + /// + /// # Arguments + /// * `bound` - Indicates if we are searching the minimum or maximum bound + /// * `search_string` - The string/peptide we are searching in the suffix array + /// + /// # Returns + /// + /// The first argument is true if a match was found + /// The second argument indicates the index of the minimum or maximum bound for the match (depending on `bound`) + fn binary_search_bound(&self, bound: BoundSearch, search_string: &[u8]) -> (bool, usize) { + let mut left: usize = 0; + let mut right: usize = self.sa.len(); + let mut lcp_left: usize = 0; + let mut lcp_right: usize = 0; + let mut found = false; + + // repeat until search window is minimum size OR we matched the whole search string last iteration + while right - left > 1 { + let center = (left + right) / 2; + let skip = min(lcp_left, lcp_right); + let (retval, lcp_center) = self.compare(search_string, self.sa[center], skip, bound); + + found |= 
lcp_center == search_string.len(); + + // update the left and right bound, depending on if we are searching the min or max bound + if retval && bound == Minimum || !retval && bound == Maximum { + right = center; + lcp_right = lcp_center; + } else { + left = center; + lcp_left = lcp_center; + } + } + + // handle edge case to search at index 0 + if right == 1 && left == 0 { + let (retval, lcp_center) = + self.compare(search_string, self.sa[0], min(lcp_left, lcp_right), bound); + + found |= lcp_center == search_string.len(); + + if bound == Minimum && retval { + right = 0; + } + } + + match bound { + Minimum => (found, right), + Maximum => (found, left), + } + } + + /// Searches for the minimum and maximum bound for a string in the suffix array + /// + /// # Arguments + /// * `search_string` - The string/peptide we are searching in the suffix array + /// + /// # Returns + /// + /// Returns the minimum and maximum bound of all matches in the suffix array, or `NoMatches` if no matches were found + pub fn search_bounds(&self, search_string: &[u8]) -> BoundSearchResult { + let (found_min, min_bound) = self.binary_search_bound(Minimum, search_string); + + if !found_min { + return BoundSearchResult::NoMatches; + } + + let (_, max_bound) = self.binary_search_bound(Maximum, search_string); + + BoundSearchResult::SearchResult((min_bound, max_bound + 1)) + } + + /// Searches for the suffixes matching a search string + /// During search I and L can be equated + /// + /// # Arguments + /// * `search_string` - The string/peptide we are searching in the suffix array + /// * `max_matches` - The maximum amount of matches processed, if more matches are found we don't process them + /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false + /// + /// # Returns + /// + /// Returns all the matching suffixes + #[inline] + pub fn search_matching_suffixes( + &self, + search_string: &[u8], + max_matches: usize, + equalize_i_and_l: bool, + ) -> SearchAllSuffixesResult { + let mut matching_suffixes: Vec = vec![]; + let mut il_locations = vec![]; + for (i, &character) in search_string.iter().enumerate() { + if character == b'I' || character == b'L' { + il_locations.push(i); + } + } + + let mut skip: usize = 0; + while skip < self.sparseness_factor as usize { + let mut il_locations_start = 0; + while il_locations_start < il_locations.len() && il_locations[il_locations_start] < skip { + il_locations_start += 1; + } + let il_locations_current_suffix = &il_locations[il_locations_start..]; + let current_search_string_prefix = &search_string[..skip]; + let current_search_string_suffix = &search_string[skip..]; + let search_bound_result = self.search_bounds(&search_string[skip..]); + // if the shorter part is matched, see if what goes before the matched suffix matches the unmatched part of the prefix + if let BoundSearchResult::SearchResult((min_bound, max_bound)) = search_bound_result { + // try all the partially matched suffixes and store the matching suffixes in an array (stop when our max number of matches is reached) + let mut sa_index = min_bound; + while sa_index < max_bound { + let suffix = self.sa[sa_index] as usize; + // filter away matches where I was wrongfully equalized to L, and check the unmatched prefix + // when I and L equalized, we only need to check the prefix, not the whole match, when the prefix is 0, we don't need to check at all + if suffix >= skip + && ((skip == 0 + || Self::check_prefix( + current_search_string_prefix, + &self.proteins.input_string[suffix - 
skip..suffix], + equalize_i_and_l, + )) + && Self::check_suffix( + skip, + il_locations_current_suffix, + current_search_string_suffix, + &self.proteins.input_string + [suffix..suffix + search_string.len() - skip], + equalize_i_and_l, + )) + { + matching_suffixes.push((suffix - skip) as i64); + + // return if max number of matches is reached + if matching_suffixes.len() >= max_matches { + return SearchAllSuffixesResult::MaxMatches(matching_suffixes); + } + } + sa_index += 1; + } + } + skip += 1; + } + + if matching_suffixes.is_empty() { + SearchAllSuffixesResult::NoMatches + } else { + SearchAllSuffixesResult::SearchResult(matching_suffixes) + } + } + + /// Returns true of the prefixes are the same + /// if `equalize_i_and_l` is set to true, L and I are considered the same + /// + /// # Arguments + /// * `search_string_prefix` - The unchecked prefix of the string/peptide that is searched + /// * `index_prefix` - The unchecked prefix from the protein from the suffix array + /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false + /// + /// # Returns + /// + /// Returns true if `search_string_prefix` and `index_prefix` are considered the same, otherwise false + #[inline] + fn check_prefix( + search_string_prefix: &[u8], + index_prefix: &[u8], + equalize_i_and_l: bool, + ) -> bool { + if equalize_i_and_l { + search_string_prefix.iter().zip(index_prefix).all( + |(&search_character, &index_character)| { + search_character == index_character + || (search_character == b'I' && index_character == b'L') + || (search_character == b'L' && index_character == b'I') + }, + ) + } else { + search_string_prefix == index_prefix + } + } + + /// Returns true of the search_string and index_string are equal + /// This is automatically true if `equalize_i_and_l` is set to true, since there matched during search where I = L + /// If `equalize_i_and_l` is set to false, we need to check if the I and L locations have the same character + /// + /// # Arguments + /// * `skip` - The used skip factor during the search iteration + /// * `il_locations` - The locations of the I's and L's in the **original** peptide + /// * `search_string` - The peptide that is being searched, but already with the skipped prefix removed from it + /// * `index_string` - The suffix that search_string matches with when I and L were equalized during search + /// * `equalize_i_and_l` - True if we want to equate I and L during search, otherwise false + /// + /// # Returns + /// + /// Returns true if `search_string` and `index_string` are considered the same, otherwise false + fn check_suffix( + skip: usize, + il_locations: &[usize], + search_string: &[u8], + index_string: &[u8], + equalize_i_and_l: bool, + ) -> bool { + if equalize_i_and_l { + true + } else { + for &il_location in il_locations { + let index = il_location - skip; + if search_string[index] != index_string[index] { + return false; + } + } + true + } + } + + /// Returns all the proteins that correspond with the provided suffixes + /// + /// # Arguments + /// * `suffixes` - List of suffix indices + /// + /// # Returns + /// + /// Returns the proteins that every suffix is a part of + #[inline] + pub fn retrieve_proteins(&self, suffixes: &Vec) -> Vec<&Protein> { + let mut res = vec![]; + for &suffix in suffixes { + let protein_index = self.suffix_index_to_protein.suffix_to_protein(suffix); + if !protein_index.is_null() { + res.push(&self.proteins[protein_index as usize]); + } + } + res + } + + /// Searches all the matching proteins for a 
search_string/peptide in the suffix array + /// + /// # Arguments + /// * `search_string` - The string/peptide being searched + /// * `equalize_i_and_l` - If set to true, I and L are equalized during search + /// + /// # Returns + /// + /// Returns the matching proteins for the search_string + pub fn search_proteins_for_peptide(&self, search_string: &[u8], equalize_i_and_l: bool) -> Vec<&Protein> { + let mut matching_suffixes = vec![]; + if let SearchAllSuffixesResult::SearchResult(suffixes) = + self.search_matching_suffixes(search_string, usize::MAX, equalize_i_and_l) + { + matching_suffixes = suffixes; + } + self.retrieve_proteins(&matching_suffixes) + } + + /// Retrieves the taxonomic analysis for a collection of proteins + /// + /// # Arguments + /// * `proteins` - A collection of proteins + /// + /// # Returns + /// + /// Returns the taxonomic analysis result for the given list of proteins + #[inline] + pub fn retrieve_lca(&self, proteins: &[&Protein]) -> Option { + let taxon_ids: Vec = proteins.iter().map(|prot| prot.taxon_id).collect(); + + self.taxon_id_calculator + .aggregate(taxon_ids) + .map(|id| self.taxon_id_calculator + .snap_taxon(id) + ) + } + + /// Returns true if the protein is considered valid by the provided taxonomy file + /// + /// # Arguments + /// * `protein` - A protein of which we want to check the validity + /// + /// # Returns + /// + /// Returns true if the protein is considered valid by the provided taxonomy file + pub fn taxon_valid(&self, protein: &Protein) -> bool { + self.taxon_id_calculator.taxon_valid(protein.taxon_id) + } + + /// Retrieves the functional analysis for a collection of proteins + /// + /// # Arguments + /// * `proteins` - A collection of proteins + /// + /// # Returns + /// + /// Returns the functional analysis result for the given list of proteins + pub fn retrieve_function(&self, proteins: &[&Protein]) -> Option { + let res = self.function_aggregator.aggregate(proteins.to_vec()); + Some(res) + } + + /// Retrieves the all the functional annotations for a collection of proteins + /// + /// # Arguments + /// * `proteins` - A collection of proteins + /// + /// # Returns + /// + /// Returns all functional annotations for a collection of proteins + pub fn get_all_functional_annotations(&self, proteins: &[&Protein]) -> Vec> { + self.function_aggregator.get_all_functional_annotations(proteins) + } + +} diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs new file mode 100644 index 0000000..395a51c --- /dev/null +++ b/sa-index/src/suffix_to_protein_index.rs @@ -0,0 +1,159 @@ +use clap::ValueEnum; +use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER}; +use crate::Nullable; + +/// Enum used to define the commandline arguments and choose which index style is used +#[derive(ValueEnum, Clone, Debug, PartialEq)] +pub enum SuffixToProteinMappingStyle { + Dense, + Sparse +} + +/// Trait implemented by the SuffixToProtein mappings +pub trait SuffixToProteinIndex: Send + Sync { + + /// Returns the index of the protein in the protein list for the given suffix + /// + /// # Arguments + /// * `suffix` - The suffix of which we want to know of which protein it is a part + /// + /// # Returns + /// + /// Returns the index of the protein in the proteins list of which the suffix is a part + fn suffix_to_protein(&self, suffix: i64) -> u32; +} + +/// Mapping that uses O(n) memory with n the size of the input text, but retrieval of the protein is in O(1) +#[derive(Debug, PartialEq)] +pub struct 
DenseSuffixToProtein { + // UniProtKB does not have more that u32::MAX proteins, so a larger type is not needed + mapping: Vec, +} + +/// Mapping that uses O(m) memory with m the number of proteins, but retrieval of the protein is O(log m) +#[derive(Debug, PartialEq)] +pub struct SparseSuffixToProtein { + mapping: Vec, +} + +impl SuffixToProteinIndex for DenseSuffixToProtein { + fn suffix_to_protein(&self, suffix: i64) -> u32 { + self.mapping[suffix as usize] + } +} + +impl SuffixToProteinIndex for SparseSuffixToProtein { + fn suffix_to_protein(&self, suffix: i64) -> u32 { + let protein_index = self.mapping.binary_search(&suffix).unwrap_or_else(|index| index - 1); + // if the next value in the mapping is 1 larger than the current suffix, that means that the current suffix starts with a SEPARATION_CHARACTER or TERMINATION_CHARACTER + // this means it does not belong to a protein + if self.mapping[protein_index + 1] == suffix + 1 { + return u32::NULL + } + protein_index as u32 + } +} + +impl DenseSuffixToProtein { + + /// Creates a new DenseSuffixToProtein mapping + /// + /// # Arguments + /// * `text` - The text over which we want to create the mapping + /// + /// # Returns + /// + /// Returns a new DenseSuffixToProtein build over the provided text + pub fn new(text: &[u8]) -> Self { + let mut current_protein_index: u32 = 0; + let mut suffix_index_to_protein: Vec = vec![]; + for &char in text.iter() { + if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER { + current_protein_index += 1; + suffix_index_to_protein.push(u32::NULL); + } else { + assert_ne!(current_protein_index, u32::NULL); + suffix_index_to_protein.push(current_protein_index); + } + } + suffix_index_to_protein.shrink_to_fit(); + DenseSuffixToProtein { mapping: suffix_index_to_protein } + } +} + +impl SparseSuffixToProtein { + + /// Creates a new SparseSuffixToProtein mapping + /// + /// # Arguments + /// * `text` - The text over which we want to create the mapping + /// + /// # Returns + /// + /// Returns a new SparseSuffixToProtein build over the provided text + pub fn new(text: &[u8]) -> Self { + let mut suffix_index_to_protein: Vec = vec![0]; + for (index, &char) in text.iter().enumerate() { + if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER { + suffix_index_to_protein.push(index as i64 + 1); + } + } + suffix_index_to_protein.shrink_to_fit(); + SparseSuffixToProtein { mapping: suffix_index_to_protein } + } + +} + + +#[cfg(test)] +mod tests { + use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER}; + use crate::Nullable; + use crate::suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex}; + + fn build_text() -> Vec { + let mut text = ["ACG", "CG", "AAA"].join(&format!("{}", SEPARATION_CHARACTER as char)); + text.push(TERMINATION_CHARACTER as char); + text.into_bytes() + } + + #[test] + fn test_dense_build() { + let u8_text = &build_text(); + let index = DenseSuffixToProtein::new(u8_text); + let expected = DenseSuffixToProtein {mapping: vec![0, 0, 0, u32::NULL, 1, 1, u32::NULL, 2, 2, 2, u32::NULL]}; + assert_eq!(index, expected); + } + + #[test] + fn test_sparse_build() { + let u8_text = &build_text(); + let index = SparseSuffixToProtein::new(u8_text); + let expected = SparseSuffixToProtein {mapping: vec![0, 4, 7, 11]}; + assert_eq!(index, expected); + } + + #[test] + fn test_search_dense() { + let u8_text = &build_text(); + let index = DenseSuffixToProtein::new(u8_text); + assert_eq!(index.suffix_to_protein(5), 1); + 
assert_eq!(index.suffix_to_protein(7), 2); + // suffix that starts with SEPARATION_CHARACTER + assert_eq!(index.suffix_to_protein(3), u32::NULL); + // suffix that starts with TERMINATION_CHARACTER + assert_eq!(index.suffix_to_protein(10), u32::NULL); + } + + #[test] + fn test_search_sparse() { + let u8_text = &build_text(); + let index = SparseSuffixToProtein::new(u8_text); + assert_eq!(index.suffix_to_protein(5), 1); + assert_eq!(index.suffix_to_protein(7), 2); + // suffix that starts with SEPARATION_CHARACTER + assert_eq!(index.suffix_to_protein(3), u32::NULL); + // suffix that starts with TERMINATION_CHARACTER + assert_eq!(index.suffix_to_protein(10), u32::NULL); + } +} \ No newline at end of file diff --git a/sa-index/src/util.rs b/sa-index/src/util.rs new file mode 100644 index 0000000..821bdc7 --- /dev/null +++ b/sa-index/src/util.rs @@ -0,0 +1,55 @@ +use std::fs::File; +use std::io; +use std::io::BufRead; +use std::path::Path; +use std::time::{SystemTime, SystemTimeError, UNIX_EPOCH}; + +use crate::sa_searcher::Searcher; + +/// Gets the current time in ms +/// +/// # Returns +/// +/// Returns the current time in ms +/// +/// # Errors +/// +/// Returns a SystemTimeError if getting the current time somehow fails +#[allow(unused)] +pub fn get_time_ms() -> Result { + Ok(SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos() as f64 * 1e-6) +} + +/// Times how long the function `f`, that has the searcher as only argument executed +/// +/// # Returns +/// +/// Returns the execution time of `f`in ms +/// +/// # Errors +/// +/// Returns a SystemTimeError if getting the start or end time failed +#[allow(unused)] +pub fn time_execution( + searcher: &mut Searcher, + f: &dyn Fn(&mut Searcher) -> bool, +) -> Result<(bool, f64), SystemTimeError> { + let start_ms = get_time_ms()?; + let found = f(searcher); + let end_ms = get_time_ms()?; + Ok((found, end_ms - start_ms)) +} + +/// Opens `filename` and creates an iterator over it per line +/// +/// # Arguments +/// * `filename` - The file we want to iterate over per line +/// +/// # Returns +/// +/// Returns an Iterator to the Reader of the lines of the file. +pub fn read_lines
<P>
(filename: P) -> io::Result>> + where P: AsRef, { + let file = File::open(filename)?; + Ok(io::BufReader::new(file).lines()) +} diff --git a/sa-mappings/Cargo.toml b/sa-mappings/Cargo.toml index f02934a..6573fb0 100644 --- a/sa-mappings/Cargo.toml +++ b/sa-mappings/Cargo.toml @@ -12,3 +12,6 @@ tempdir = "0.3.7" fa-compression = { path = "../fa-compression" } bytelines = "2.5.0" umgap = "1.1.0" +serde_json = "1.0.115" +serde = { version = "1.0.197", features = ["derive"] } + diff --git a/sa-mappings/src/functionality.rs b/sa-mappings/src/functionality.rs index b26152b..b4c9c14 100644 --- a/sa-mappings/src/functionality.rs +++ b/sa-mappings/src/functionality.rs @@ -1,8 +1,21 @@ //! This module contains the FunctionAggregator struct that is responsible for aggregating the //! functional annotations of proteins. +use std::collections::{HashMap, HashSet}; +use serde::Serialize; + + use crate::proteins::Protein; +/// A struct that represents the functional annotations once aggregated +#[derive(Debug, Serialize)] +pub struct FunctionalAggregation { + /// A HashMap representing how many GO, EC and IPR terms were found + pub counts: HashMap, + /// A HashMap representing how often a certain functional annotation was found + pub data: HashMap, +} + /// A struct that represents a function aggregator pub struct FunctionAggregator {} @@ -14,46 +27,59 @@ impl FunctionAggregator { /// /// # Returns /// - /// Returns a string containing the aggregated functional annotations - pub fn aggregate(&self, proteins: Vec) -> String { - proteins - .iter() - .map(|protein| protein.get_functional_annotations()) - .collect::>() - .join(";") + /// Returns a JSON string containing the aggregated functional annotations + pub fn aggregate(&self, proteins: Vec<&Protein>) -> FunctionalAggregation { + // Keep track of the proteins that have a certain annotation + let mut proteins_with_ec: HashSet = HashSet::new(); + let mut proteins_with_go: HashSet = HashSet::new(); + let mut proteins_with_ipr: HashSet = HashSet::new(); + + // Keep track of the counts of the different annotations + let mut data: HashMap = HashMap::new(); + + for protein in proteins.iter() { + for annotation in protein.get_functional_annotations().split(';') { + match annotation.chars().next() { + Some('E') => proteins_with_ec.insert(protein.uniprot_id.clone()), + Some('G') => proteins_with_go.insert(protein.uniprot_id.clone()), + Some('I') => proteins_with_ipr.insert(protein.uniprot_id.clone()), + _ => false + }; + + data.entry(annotation.to_string()).and_modify(|c| *c += 1).or_insert(1); + } + } + + let mut counts: HashMap = HashMap::new(); + counts.insert("all".to_string(), proteins.len()); + counts.insert("EC".to_string(), proteins_with_ec.len()); + counts.insert("GO".to_string(), proteins_with_go.len()); + counts.insert("IPR".to_string(), proteins_with_ipr.len()); + + data.remove(""); + + FunctionalAggregation { counts, data } } -} -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_aggregate() { - let proteins = vec![ - Protein { - uniprot_id: "uniprot1".to_string(), - sequence: (0, 3), - taxon_id: 1, - functional_annotations: vec![ - 0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27, - ] - }, - Protein { - uniprot_id: "uniprot2".to_string(), - sequence: (4, 3), - taxon_id: 2, - functional_annotations: vec![ - 0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27, - ] - }, - ]; - - let function_aggregator = FunctionAggregator {}; - - assert_eq!( - function_aggregator.aggregate(proteins), - 
"GO:0009279;IPR:IPR016364;IPR:IPR008816;GO:0009279;IPR:IPR016364;IPR:IPR008816" - ); + + /// Aggregates the functional annotations of proteins + /// + /// # Arguments + /// * `proteins` - A vector of proteins + /// + /// # Returns + /// + /// Returns a list of lists with all the functional annotations per protein + pub fn get_all_functional_annotations(&self, proteins: &[&Protein]) -> Vec> { + proteins + .iter() + .map( + |&prot| + prot.get_functional_annotations() + .split(';') + .map(|ann| ann.to_string()) + .collect() + ) + .collect::>>() } } diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs index a79dd3d..cda9b94 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -27,9 +27,6 @@ pub struct Protein { /// The id of the protein pub uniprot_id: String, - /// start position and length of the protein in the input string - pub sequence: (usize, u32), - /// the taxon id of the protein pub taxon_id: TaxonId, @@ -40,7 +37,7 @@ pub struct Protein { /// A struct that represents a collection of proteins pub struct Proteins { /// The input string containing all proteins - input_string: Vec, + pub input_string: Vec, /// The proteins in the input string proteins: Vec @@ -76,8 +73,6 @@ impl Proteins { let file = File::open(file)?; - let mut start_index = 0; - // Read the lines as bytes, since the input string is not guaranteed to be utf8 // because of the encoded functional annotations let mut lines = ByteLines::new(BufReader::new(file)); @@ -100,38 +95,68 @@ impl Proteins { proteins.push(Protein { uniprot_id: uniprot_id.to_string(), - sequence: (start_index, sequence.len() as u32), taxon_id, functional_annotations }); - start_index += sequence.len() + 1; } input_string.pop(); input_string.push(TERMINATION_CHARACTER.into()); - + input_string.shrink_to_fit(); + proteins.shrink_to_fit(); Ok(Self { input_string: input_string.into_bytes(), proteins }) } - /// Returns the sequence of a protein + /// Creates a `vec` which represents all the proteins concatenated from the database file /// /// # Arguments - /// * `protein` - The protein to get the sequence from + /// * `file` - The path to the database file + /// * `taxon_aggregator` - The `TaxonAggregator` to use /// /// # Returns /// - /// Returns a string slice containing the sequence of the protein - pub fn get_sequence(&self, protein: &Protein) -> &str { - let (start, length) = protein.sequence; - let end = start + length as usize; + /// Returns a `Result` containing the `Vec` + /// + /// # Errors + /// + /// Returns a `Box` if an error occurred while reading the database file + pub fn try_from_database_file_without_annotations(database_file: &str, taxon_aggregator: &TaxonAggregator) -> Result, Box> { + let mut input_string: String = String::new(); + + let file = File::open(database_file)?; + + // Read the lines as bytes, since the input string is not guaranteed to be utf8 + // because of the encoded functional annotations + let mut lines = ByteLines::new(BufReader::new(file)); + + while let Some(Ok(line)) = lines.next() { + let mut fields = line.split(|b| *b == b'\t'); + + // only get the taxon id and sequence from each line, we don't need the other parts + fields.next(); + let taxon_id = from_utf8(fields.next().unwrap())?.parse::()?; + let sequence = from_utf8(fields.next().unwrap())?; + fields.next(); + + if !taxon_aggregator.taxon_exists(taxon_id) { + continue; + } + + input_string.push_str(&sequence.to_uppercase()); + input_string.push(SEPARATION_CHARACTER.into()); + } + + input_string.pop(); + 
input_string.push(TERMINATION_CHARACTER.into()); - // unwrap should never fail since the input string will always be utf8 - std::str::from_utf8(&self.input_string[start .. end]).unwrap() + input_string.shrink_to_fit(); + Ok(input_string.into_bytes()) } + } impl Index for Proteins { @@ -210,13 +235,11 @@ mod tests { fn test_new_protein() { let protein = Protein { uniprot_id: "P12345".to_string(), - sequence: (0, 3), taxon_id: 1, functional_annotations: vec![0xD1, 0x11] }; assert_eq!(protein.uniprot_id, "P12345"); - assert_eq!(protein.sequence, (0, 3)); assert_eq!(protein.taxon_id, 1); assert_eq!(protein.functional_annotations, vec![0xD1, 0x11]); } @@ -230,13 +253,11 @@ mod tests { proteins: vec![ Protein { uniprot_id: "P12345".to_string(), - sequence: (0, 3), taxon_id: 1, functional_annotations: vec![0xD1, 0x11] }, Protein { uniprot_id: "P54321".to_string(), - sequence: (4, 3), taxon_id: 2, functional_annotations: vec![0xD1, 0x11] }, @@ -249,19 +270,17 @@ mod tests { ); assert_eq!(proteins.proteins.len(), 2); assert_eq!(proteins.proteins[0].uniprot_id, "P12345"); - assert_eq!(proteins.proteins[0].sequence, (0, 3)); assert_eq!(proteins.proteins[0].taxon_id, 1); assert_eq!(proteins.proteins[0].functional_annotations, vec![0xD1, 0x11]); assert_eq!(proteins.proteins[1].uniprot_id, "P54321"); - assert_eq!(proteins.proteins[1].sequence, (4, 3)); assert_eq!(proteins.proteins[1].taxon_id, 2); assert_eq!(proteins.proteins[1].functional_annotations, vec![0xD1, 0x11]); } #[test] - fn test_get_sequence() { + fn test_get_taxon() { // Create a temporary directory for this test - let tmp_dir = TempDir::new("test_get_sequences").unwrap(); + let tmp_dir = TempDir::new("test_get_taxon").unwrap(); let database_file = create_database_file(&tmp_dir); let taxonomy_file = create_taxonomy_file(&tmp_dir); @@ -270,25 +289,21 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::Lca ) - .unwrap(); + .unwrap(); let proteins = Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator) .unwrap(); - //assert_eq!(proteins.proteins.len(), 4); - assert_eq!(proteins.get_sequence(&proteins[0]), "MLPGLALLLLAAWTARALEV"); - assert_eq!(proteins.get_sequence(&proteins[1]), "PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG"); - assert_eq!(proteins.get_sequence(&proteins[2]), "KWDSDPSGTKTCIDT"); - assert_eq!( - proteins.get_sequence(&proteins[3]), - "KEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH" - ); + let taxa = vec![1, 2, 6, 17]; + for (i, protein) in proteins.proteins.iter().enumerate() { + assert_eq!(protein.taxon_id, taxa[i]); + } } #[test] - fn test_get_taxon() { + fn test_get_functional_annotations() { // Create a temporary directory for this test - let tmp_dir = TempDir::new("test_get_taxon").unwrap(); + let tmp_dir = TempDir::new("test_get_fa").unwrap(); let database_file = create_database_file(&tmp_dir); let taxonomy_file = create_taxonomy_file(&tmp_dir); @@ -297,19 +312,21 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::Lca ) - .unwrap(); + .unwrap(); let proteins = Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator) .unwrap(); - let taxa = vec![1, 2, 6, 17]; - for (i, protein) in proteins.proteins.iter().enumerate() { - assert_eq!(protein.taxon_id, taxa[i]); + for protein in proteins.proteins.iter() { + assert_eq!( + decode(&protein.functional_annotations), + "GO:0009279;IPR:IPR016364;IPR:IPR008816" + ); } } #[test] - fn test_get_functional_annotations() { + fn test_get_concatenated_proteins() { // Create a temporary directory for this test let 
tmp_dir = TempDir::new("test_get_fa").unwrap(); @@ -320,16 +337,14 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::Lca ) - .unwrap(); + .unwrap(); let proteins = - Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator) + Proteins::try_from_database_file_without_annotations(database_file.to_str().unwrap(), &taxon_aggregator) .unwrap(); - for protein in proteins.proteins.iter() { - assert_eq!( - decode(&protein.functional_annotations), - "GO:0009279;IPR:IPR016364;IPR:IPR008816" - ); - } + let sep_char = SEPARATION_CHARACTER as char; + let end_char = TERMINATION_CHARACTER as char; + let expected = format!("MLPGLALLLLAAWTARALEV{}PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG{}KWDSDPSGTKTCIDT{}KEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH{}", sep_char, sep_char, sep_char, end_char); + assert_eq!(proteins, expected.as_bytes()); } } diff --git a/sa-mappings/src/taxonomy.rs b/sa-mappings/src/taxonomy.rs index ada93ff..9ba1e26 100644 --- a/sa-mappings/src/taxonomy.rs +++ b/sa-mappings/src/taxonomy.rs @@ -91,6 +91,23 @@ impl TaxonAggregator { self.taxon_list.get(taxon).is_some() } + /// Checks if a taxon is valid to be used during taxonomic aggregation + /// + /// # Arguments + /// + /// * `taxon` - The taxon ID to check. + /// + /// # Returns + /// + /// Returns a boolean value indicating whether the taxon exists and is valid + pub fn taxon_valid(&self, taxon: TaxonId) -> bool { + let optional_taxon = self.taxon_list.get(taxon); + match optional_taxon { + None => false, + Some(taxon) => taxon.valid + } + } + /// Snaps a taxon to its closest ancestor in the taxonomic tree. /// /// # Arguments @@ -109,15 +126,22 @@ impl TaxonAggregator { /// # Arguments /// /// * `taxa` - A vector of taxon IDs to aggregate. + /// * `clean_taxa` - If true, only the taxa which are stored as "valid" are used during aggregation /// /// # Returns /// - /// Returns the aggregated taxon ID, or panics if aggregation fails. - pub fn aggregate(&self, taxa: Vec) -> TaxonId { - let count = count(taxa.into_iter().map(|t| (t, 1.0))); - self.aggregator + /// Returns the aggregated taxon ID wrapped in Some if aggregation succeeds, + /// Returns None if the list of taxa to aggregate is emtpy, + /// Panics if aggregation fails. 
+ pub fn aggregate(&self, taxa: Vec, ) -> Option { + if taxa.is_empty() { + return None + } + + let count = count(taxa.into_iter().map(|t| (t, 1.0_f32))); + Some(self.aggregator .aggregate(&count) - .unwrap_or_else(|_| panic!("Could not aggregate following taxon ids: {:?}", &count)) + .unwrap_or_else(|_| panic!("Could not aggregate following taxon ids: {:?}", &count))) } } @@ -151,6 +175,7 @@ mod tests { writeln!(file, "18\tPelobacter\tgenus\t17\t\x01").unwrap(); writeln!(file, "19\tSyntrophotalea carbinolica\tspecies\t17\t\x01").unwrap(); writeln!(file, "20\tPhenylobacterium\tgenus\t19\t\x01").unwrap(); + writeln!(file, "21\tInvalid\tspecies\t19\t\x00").unwrap(); taxonomy_file } @@ -166,12 +191,12 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::Lca ) - .unwrap(); + .unwrap(); let _ = TaxonAggregator::try_from_taxonomy_file( taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar ) - .unwrap(); + .unwrap(); } #[test] @@ -185,7 +210,7 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::Lca ) - .unwrap(); + .unwrap(); for i in 0 ..= 20 { if [0, 3, 4, 5, 8, 12, 15].contains(&i) { @@ -207,7 +232,7 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::Lca ) - .unwrap(); + .unwrap(); for i in 0 ..= 20 { if ![0, 3, 4, 5, 8, 12, 15].contains(&i) { @@ -227,11 +252,11 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::Lca ) - .unwrap(); + .unwrap(); - assert_eq!(taxon_aggregator.aggregate(vec![7, 9]), 6); - assert_eq!(taxon_aggregator.aggregate(vec![11, 14]), 10); - assert_eq!(taxon_aggregator.aggregate(vec![17, 19]), 17); + assert_eq!(taxon_aggregator.aggregate(vec![7, 9]), Some(6)); + assert_eq!(taxon_aggregator.aggregate(vec![11, 14]), Some(10)); + assert_eq!(taxon_aggregator.aggregate(vec![17, 19]), Some(17)); } #[test] @@ -245,10 +270,10 @@ mod tests { taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar ) - .unwrap(); + .unwrap(); - assert_eq!(taxon_aggregator.aggregate(vec![7, 9]), 6); - assert_eq!(taxon_aggregator.aggregate(vec![11, 14]), 10); - assert_eq!(taxon_aggregator.aggregate(vec![17, 19]), 19); + assert_eq!(taxon_aggregator.aggregate(vec![7, 9]), Some(6)); + assert_eq!(taxon_aggregator.aggregate(vec![11, 14]), Some(10)); + assert_eq!(taxon_aggregator.aggregate(vec![17, 19]), Some(19)); } } diff --git a/sa-server/Cargo.toml b/sa-server/Cargo.toml index 18698eb..c1920f5 100644 --- a/sa-server/Cargo.toml +++ b/sa-server/Cargo.toml @@ -6,3 +6,10 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +axum = { version = "0.7.4", features = ["macros"] } +serde = { version = "1.0.197", features = ["derive"] } +tokio = { version = "1.36.0", features = ["rt", "rt-multi-thread", "macros"] } +sa-index = { path = "../sa-index" } +clap = { version = "4.5.1", features = ["derive"] } +sa-builder = { path = "../sa-builder" } +sa-mappings = { path = "../sa-mappings" } \ No newline at end of file diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs index e7a11a9..52bb657 100644 --- a/sa-server/src/main.rs +++ b/sa-server/src/main.rs @@ -1,3 +1,184 @@ -fn main() { - println!("Hello, world!"); +use std::error::Error; +use std::sync::Arc; + +use axum::{http::StatusCode, Json, Router}; +use axum::extract::{DefaultBodyLimit, State}; +use axum::routing::{get, post}; +use clap::Parser; +use serde::{Deserialize, Serialize}; + +use sa_mappings::functionality::FunctionAggregator; +use sa_mappings::proteins::Proteins; +use 
sa_mappings::taxonomy::{AggregationMethod, TaxonAggregator}; +use sa_index::peptide_search::{OutputData, analyse_all_peptides, SearchResultWithAnalysis, SearchOnlyResult, search_all_peptides}; +use sa_index::sa_searcher::Searcher; +use sa_index::suffix_to_protein_index::SparseSuffixToProtein; +use sa_builder::binary::load_suffix_array; + +/// Enum that represents all possible commandline arguments +#[derive(Parser, Debug)] +pub struct Arguments { + /// File with the proteins used to build the suffix tree. All the proteins are expected to be concatenated using a `#`. + #[arg(short, long)] + database_file: String, + #[arg(short, long)] + index_file: String, + #[arg(short, long)] + /// The taxonomy to be used as a tsv file. This is a preprocessed version of the NCBI taxonomy. + taxonomy: String, +} + +/// Function used by serde to place a default value in the cutoff field of the input +fn default_cutoff() -> usize { + 10000 +} + +/// Function used by serde to use `true` as a default value +#[allow(dead_code)] +fn default_true() -> bool { + true +} + +/// Struct representing the input arguments accepted by the endpoints +/// +/// # Arguments +/// * `peptides` - List of peptides we want to process +/// * `cutoff` - The maximum amount of matches to process, default value 10000 +/// * `equalize_I_and_L` - True if we want to equalize I and L during search +/// * `clean_taxa` - True if we only want to use proteins marked as "valid" +#[derive(Debug, Deserialize, Serialize)] +#[allow(non_snake_case)] +struct InputData { + peptides: Vec, + #[serde(default = "default_cutoff")] // default value is 10000 + cutoff: usize, + #[serde(default = "bool::default")] + // default value is false // TODO: maybe default should be true? + equalize_I_and_L: bool, + #[serde(default = "bool::default")] // default value is false + clean_taxa: bool, +} + +#[tokio::main] +async fn main() { + let args = Arguments::parse(); + if let Err(err) = start_server(args).await { + eprintln!("{}", err); + std::process::exit(1); + } +} + +/// Basic handler used to check the server status +async fn root() -> &'static str { + "Server is online" +} + +/// Endpoint executed for peptide matching and taxonomic and functional analysis +/// +/// # Arguments +/// * `state(searcher)` - The searcher object provided by the server +/// * `data` - InputData object provided by the user with the peptides to be searched and the config +/// +/// # Returns +/// +/// Returns the search and analysis results from the index as a JSON +async fn analyse( + State(searcher): State>, + data: Json, +) -> Result>, StatusCode> { + let search_result = analyse_all_peptides( + &searcher, + &data.peptides, + data.cutoff, + data.equalize_I_and_L, + data.clean_taxa, + ); + + Ok(Json(search_result)) +} + +/// Endpoint executed for peptide matching, without any analysis +/// +/// # Arguments +/// * `state(searcher)` - The searcher object provided by the server +/// * `data` - InputData object provided by the user with the peptides to be searched and the config +/// +/// # Returns +/// +/// Returns the search results from the index as a JSON +async fn search( + State(searcher): State>, + data: Json, +) -> Result>, StatusCode> { + let search_result = search_all_peptides( + &searcher, + &data.peptides, + data.cutoff, + data.equalize_I_and_L, + data.clean_taxa, + ); + + Ok(Json(search_result)) +} + +/// Starts the server with the provided commandline arguments +/// +/// # Arguments +/// * `args` - The provided commandline arguments +/// +/// # Returns +/// +/// Returns () +/// 
+/// # Errors +/// +/// Returns any error occurring during the startup or uptime of the server +async fn start_server(args: Arguments) -> Result<(), Box> { + let Arguments { + database_file, + index_file, + taxonomy, + } = args; + + eprintln!("Loading suffix array..."); + let (sparseness_factor, sa) = load_suffix_array(&index_file)?; + + eprintln!("Loading taxon file..."); + let taxon_id_calculator = + TaxonAggregator::try_from_taxonomy_file(&taxonomy, AggregationMethod::LcaStar)?; + + let function_aggregator = FunctionAggregator {}; + + eprintln!("Loading proteins..."); + let proteins = Proteins::try_from_database_file(&database_file, &taxon_id_calculator)?; + let suffix_index_to_protein = Box::new(SparseSuffixToProtein::new(&proteins.input_string)); + + eprintln!("Creating searcher..."); + let searcher = Arc::new(Searcher::new( + sa, + sparseness_factor, + suffix_index_to_protein, + proteins, + taxon_id_calculator, + function_aggregator, + )); + + // build our application with a route + let app = Router::new() + // `GET /` goes to `root` + .route("/", get(root)) + // `POST /analyse` goes to `analyse` and set max payload size to 5 MB + .route("/analyse", post(analyse)) + .layer(DefaultBodyLimit::max(5 * 10_usize.pow(6))) + .with_state(searcher.clone()) + // `POST /search` goes to `search` and set max payload size to 5 MB + .route("/search", post(search)) + .layer(DefaultBodyLimit::max(5 * 10_usize.pow(6))) + .with_state(searcher); + + let listener = tokio::net::TcpListener::bind("0.0.0.0:3000").await?; + println!("server is ready..."); + axum::serve(listener, app).await?; + + Ok(()) }
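
For illustration, the pieces added in this patch can also be driven directly from Rust without going through the HTTP layer. The sketch below simply mirrors the call sequence of `start_server` above; the file names ("uniprot.sa", "taxons.tsv", "uniprot.tsv") and the peptides are hypothetical placeholders, so treat this as a sketch of the intended usage rather than a tested program.

    use std::error::Error;

    use sa_builder::binary::load_suffix_array;
    use sa_index::peptide_search::analyse_all_peptides;
    use sa_index::sa_searcher::Searcher;
    use sa_index::suffix_to_protein_index::SparseSuffixToProtein;
    use sa_mappings::functionality::FunctionAggregator;
    use sa_mappings::proteins::Proteins;
    use sa_mappings::taxonomy::{AggregationMethod, TaxonAggregator};

    fn main() -> Result<(), Box<dyn Error>> {
        // Hypothetical input files, produced by sa-builder and the Unipept database dumps.
        let index_file = "uniprot.sa".to_string();
        let taxonomy_file = "taxons.tsv".to_string();
        let database_file = "uniprot.tsv".to_string();

        // Load the sparse suffix array, the taxonomy and the protein database,
        // exactly as start_server does.
        let (sparseness_factor, sa) = load_suffix_array(&index_file)?;
        let taxon_aggregator =
            TaxonAggregator::try_from_taxonomy_file(&taxonomy_file, AggregationMethod::LcaStar)?;
        let proteins = Proteins::try_from_database_file(&database_file, &taxon_aggregator)?;
        let suffix_index_to_protein = Box::new(SparseSuffixToProtein::new(&proteins.input_string));

        // Same constructor call as in start_server, but without wrapping the searcher in an Arc.
        let searcher = Searcher::new(
            sa,
            sparseness_factor,
            suffix_index_to_protein,
            proteins,
            taxon_aggregator,
            FunctionAggregator {},
        );

        // Search two example peptides with the server's default cutoff of 10000 matches,
        // without equating I and L and without filtering out taxonomically invalid proteins.
        let peptides = vec!["AALTER".to_string(), "MDGTEYIIVK".to_string()];
        let output = analyse_all_peptides(&searcher, &peptides, 10_000, false, false);
        println!("{:?}", output);

        Ok(())
    }

Over HTTP the same functionality is exposed by the new sa-server binary on port 3000: the request body for POST /analyse or POST /search is a JSON object with a required "peptides" list and the optional fields "cutoff" (default 10000), "equalize_I_and_L" (default false) and "clean_taxa" (default false).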