diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..a9084f7
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,23 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/ubuntu
+{
+ "name": "Unipept Index",
+ "image": "mcr.microsoft.com/devcontainers/base:ubuntu",
+
+ // Features to add to the dev container. More info: https://containers.dev/features.
+ "features": {
+ "ghcr.io/devcontainers/features/rust:1": {}
+ },
+
+ // Use 'forwardPorts' to make a list of ports inside the container available locally.
+ // "forwardPorts": [],
+
+ // Use 'postCreateCommand' to run commands after the container is created.
+ // "postCreateCommand": "",
+
+ // Configure tool-specific properties.
+ // "customizations": {},
+
+ // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
+ // "remoteUser": "root"
+}
diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml
new file mode 100644
index 0000000..2b12566
--- /dev/null
+++ b/.github/workflows/build_index.yml
@@ -0,0 +1,82 @@
+name: Build index binaries
+
+on:
+ schedule:
+ # Run on the first day of every month at midnight UTC
+ - cron: '0 0 1 * *'
+ push:
+ branches:
+ - feature/build_index_action
+ workflow_dispatch:
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+
+ steps:
+ # Check out the most recent version of the repository with submodules
+ - name: Check out repository
+ uses: actions/checkout@v3
+ with:
+ submodules: recursive
+
+ # Set up Rust toolchain
+ - name: Set up Rust
+ uses: dtolnay/rust-toolchain@stable
+
+ # Compile Rust code
+ - name: Compile Rust code
+ run: cargo build --release
+
+ # Create a directory "build"
+ - name: Create build directory
+ run: mkdir -p build/input
+
+ # Download the file "suffix-array.zip" from the most recent release of "unipept-database"
+ - name: Download suffix-array.zip
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ run: |
+ latest_release_url=$(curl -s https://api.github.com/repos/unipept/unipept-database/releases/latest | grep "browser_download_url.*suffix-array.zip" | cut -d '"' -f 4)
+ release_date=$(curl -s https://api.github.com/repos/unipept/unipept-database/releases/latest | grep '"published_at":' | cut -d '"' -f 4 | cut -d'T' -f1)
+ release_date_formatted=$(date -d "$release_date" "+%Y-%m-%d")
+ SP_VERSION="SP_$release_date_formatted"
+ echo "SP_VERSION=$SP_VERSION" >> $GITHUB_ENV
+ curl -L -o build/suffix-array.zip $latest_release_url
+
+ # Extract the contents of the output folder from the zip into a folder "build/input"
+ - name: Extract zip contents
+ run: |
+ unzip build/suffix-array.zip '*' -d build/input
+
+ # Make a directory with the SP_VERSION and process files
+ - name: Process files
+ run: |
+ mkdir -p build/$SP_VERSION
+ lz4 -d build/input/uniprot_entries.tsv.lz4 | cut -f2,4,7,8 > build/$SP_VERSION/proteins.tsv
+ lz4 -d build/input/taxons.tsv.lz4 > build/$SP_VERSION/taxons.tsv
+
+ # Run the sa-builder command
+ - name: Run sa-builder
+ run: |
+ prefix="build/$SP_VERSION"
+ ./target/release/sa-builder -d "$prefix/proteins.tsv" -o "$prefix/sa_sparse3_compressed.bin" -s 3 -a lib-div-suf-sort -c
+
+ # Zip the contents of the build/$SP_VERSION directory
+ - name: Zip build contents
+ run: |
+ cd "build/$SP_VERSION" && zip "index_$SP_VERSION.zip" "proteins.tsv" "taxons.tsv" "sa_sparse3_compressed.bin"
+
+ # Create a GitHub release and upload the zip file
+ - name: Upload or Update Release
+ id: upload_or_update_release
+ uses: softprops/action-gh-release@v1
+ with:
+ files: build/${{ env.SP_VERSION }}/index_${{ env.SP_VERSION }}.zip
+ tag_name: index-${{ env.SP_VERSION }}
+ name: Index ${{ env.SP_VERSION }}
+ target_commitish: ${{ github.sha }}
+ draft: false
+ prerelease: false
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..92b44ab
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
index c29abc3..9d81263 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1127,15 +1127,18 @@ dependencies = [
"serde",
"serde_json",
"tempdir",
+ "text-compression",
]
[[package]]
name = "sa-mappings"
version = "0.1.0"
dependencies = [
+ "bitarray",
"bytelines",
"fa-compression",
"tempdir",
+ "text-compression",
]
[[package]]
@@ -1275,6 +1278,13 @@ dependencies = [
"remove_dir_all",
]
+[[package]]
+name = "text-compression"
+version = "0.1.0"
+dependencies = [
+ "bitarray",
+]
+
[[package]]
name = "tinytemplate"
version = "1.2.1"
diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs
index e7265cd..a8084d1 100644
--- a/bitarray/src/binary.rs
+++ b/bitarray/src/binary.rs
@@ -159,10 +159,10 @@ mod tests {
#[test]
fn test_write_binary() {
let mut bitarray = BitArray::with_capacity(4, 40);
- bitarray.set(0, 0x1234567890);
- bitarray.set(1, 0xabcdef0123);
- bitarray.set(2, 0x4567890abc);
- bitarray.set(3, 0xdef0123456);
+ bitarray.set(0, 0x1234567890_u64);
+ bitarray.set(1, 0xabcdef0123_u64);
+ bitarray.set(2, 0x4567890abc_u64);
+ bitarray.set(3, 0xdef0123456_u64);
let mut buffer = Vec::new();
bitarray.write_binary(&mut buffer).unwrap();
diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs
index 655d17e..901b395 100644
--- a/bitarray/src/lib.rs
+++ b/bitarray/src/lib.rs
@@ -85,6 +85,7 @@ impl BitArray {
/// * `index` - The index of the value to set.
/// * `value` - The value to set at the specified index.
pub fn set(&mut self, index: usize, value: u64) {
+ // NOTE: `value` is already `u64`; no conversion or rebinding is needed here.
let start_block = index * self.bits_per_value / 64;
let start_block_offset = index * self.bits_per_value % 64;
@@ -142,11 +143,14 @@ impl BitArray {
pub fn clear(&mut self) {
self.data.iter_mut().for_each(|x| *x = 0);
}
+
+ pub fn get_data_slice(&self, start_slice: usize, end_slice: usize) -> &[u64] {
+ &self.data[start_slice..end_slice]
+ }
}
-/// Writes the data to a writer in a binary format using a bit array. This function is helpfull
-/// when writing large amounts of data to a writer in chunks. The data is written in chunks of the
-/// specified capacity, so memory usage is minimized.
+/// Writes the data to a writer in a binary format using a bit array. The data is written
+/// in chunks of the specified capacity, so memory usage is minimized.
///
/// # Arguments
///
@@ -257,10 +261,10 @@ mod tests {
fn test_bitarray_set() {
let mut bitarray = BitArray::with_capacity(4, 40);
- bitarray.set(0, 0b0001110011111010110001000111111100110010);
- bitarray.set(1, 0b1100001001010010011000010100110111001001);
- bitarray.set(2, 0b1111001101001101101101101011101001010001);
- bitarray.set(3, 0b0000100010010001010001001110101110011100);
+ bitarray.set(0, 0b0001110011111010110001000111111100110010_u64);
+ bitarray.set(1, 0b1100001001010010011000010100110111001001_u64);
+ bitarray.set(2, 0b1111001101001101101101101011101001010001_u64);
+ bitarray.set(3, 0b0000100010010001010001001110101110011100_u64);
assert_eq!(bitarray.data, vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000]);
}
diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs
index 98a1414..01cc3c4 100644
--- a/sa-builder/src/main.rs
+++ b/sa-builder/src/main.rs
@@ -21,7 +21,7 @@ fn main() {
eprintln!();
eprintln!("📋 Started loading the proteins...");
let start_proteins_time = get_time_ms().unwrap();
- let mut data = Proteins::try_from_database_file_without_annotations(&database_file)
+ let mut data = Proteins::try_from_database_file_uncompressed(&database_file)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!(
"✅ Successfully loaded the proteins in {} seconds!",
diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml
index de57fc9..25dda76 100644
--- a/sa-index/Cargo.toml
+++ b/sa-index/Cargo.toml
@@ -14,5 +14,6 @@ clap = { version = "4.4.8", features = ["derive"] }
rayon = "1.8.1"
serde = { version = "1.0.197", features = ["derive"] }
sa-mappings = { path = "../sa-mappings" }
+text-compression = { path = "../text-compression" }
bitarray = { path = "../bitarray" }
serde_json = "1.0.116"
diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs
index f276906..53f5348 100644
--- a/sa-index/src/lib.rs
+++ b/sa-index/src/lib.rs
@@ -115,11 +115,11 @@ mod tests {
#[test]
fn test_suffix_array_compressed() {
let mut bitarray = BitArray::with_capacity(5, 40);
- bitarray.set(0, 1);
- bitarray.set(1, 2);
- bitarray.set(2, 3);
- bitarray.set(3, 4);
- bitarray.set(4, 5);
+ bitarray.set(0, 1_u64);
+ bitarray.set(1, 2_u64);
+ bitarray.set(2, 3_u64);
+ bitarray.set(3, 4_u64);
+ bitarray.set(4, 5_u64);
let sa = SuffixArray::Compressed(bitarray, 1);
assert_eq!(sa.len(), 5);
diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs
index 0be7d17..2687de0 100644
--- a/sa-index/src/sa_searcher.rs
+++ b/sa-index/src/sa_searcher.rs
@@ -1,6 +1,7 @@
use std::{cmp::min, ops::Deref};
use sa_mappings::proteins::{Protein, Proteins, SEPARATION_CHARACTER, TERMINATION_CHARACTER};
+use text_compression::ProteinTextSlice;
use crate::{
sa_searcher::BoundSearch::{Maximum, Minimum},
@@ -75,7 +76,7 @@ pub struct SparseSearcher(Searcher);
impl SparseSearcher {
pub fn new(sa: SuffixArray, proteins: Proteins) -> Self {
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
Self(searcher)
}
@@ -93,7 +94,7 @@ pub struct DenseSearcher(Searcher);
impl DenseSearcher {
pub fn new(sa: SuffixArray, proteins: Proteins) -> Self {
- let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
Self(searcher)
}
@@ -176,12 +177,12 @@ impl Searcher {
// match as long as possible
while index_in_search_string < search_string.len()
- && index_in_suffix < self.proteins.input_string.len()
- && (search_string[index_in_search_string] == self.proteins.input_string[index_in_suffix]
+ && index_in_suffix < self.proteins.text.len()
+ && (search_string[index_in_search_string] == self.proteins.text.get(index_in_suffix)
|| (search_string[index_in_search_string] == b'L'
- && self.proteins.input_string[index_in_suffix] == b'I')
+ && self.proteins.text.get(index_in_suffix) == b'I')
|| (search_string[index_in_search_string] == b'I'
- && self.proteins.input_string[index_in_suffix] == b'L'))
+ && self.proteins.text.get(index_in_suffix) == b'L'))
{
index_in_suffix += 1;
index_in_search_string += 1;
@@ -191,7 +192,7 @@ impl Searcher {
if !search_string.is_empty() {
if index_in_search_string == search_string.len() {
is_cond_or_equal = true
- } else if index_in_suffix < self.proteins.input_string.len() {
+ } else if index_in_suffix < self.proteins.text.len() {
// in our index every L was replaced by a I, so we need to replace them if we want
// to search in the right direction
let peptide_char = if search_string[index_in_search_string] == b'L' {
@@ -200,10 +201,10 @@ impl Searcher {
search_string[index_in_search_string]
};
- let protein_char = if self.proteins.input_string[index_in_suffix] == b'L' {
+ let protein_char = if self.proteins.text.get(index_in_suffix) == b'L' {
b'I'
} else {
- self.proteins.input_string[index_in_suffix]
+ self.proteins.text.get(index_in_suffix)
};
is_cond_or_equal = condition_check(peptide_char, protein_char);
@@ -347,20 +348,20 @@ impl Searcher {
// check at all
if (skip == 0
|| Self::check_prefix(
- current_search_string_prefix,
- &self.proteins.input_string[match_start..suffix],
- equate_il
- ))
+ current_search_string_prefix,
+ ProteinTextSlice::new(&self.proteins.text, match_start, suffix),
+ equate_il
+ ))
&& Self::check_suffix(
- skip,
- il_locations_current_suffix,
- current_search_string_suffix,
- &self.proteins.input_string[suffix..match_end],
- equate_il
- )
+ skip,
+ il_locations_current_suffix,
+ current_search_string_suffix,
+ ProteinTextSlice::new(&self.proteins.text, suffix, match_end),
+ equate_il
+ )
&& (!tryptic
- || ((self.check_start_of_protein(match_start) || self.check_tryptic_cut(match_start))
- && (self.check_end_of_protein(match_end) || self.check_tryptic_cut(match_end))))
+ || ((self.check_start_of_protein(match_start) || self.check_tryptic_cut(match_start))
+ && (self.check_end_of_protein(match_end) || self.check_tryptic_cut(match_end))))
{
matching_suffixes.push((suffix - skip) as i64);
@@ -394,7 +395,7 @@ impl Searcher {
/// Returns true if the cut is at the start of a protein.
#[inline]
fn check_start_of_protein(&self, cut_index: usize) -> bool {
- cut_index == 0 || self.proteins.input_string[cut_index - 1] == SEPARATION_CHARACTER
+ cut_index == 0 || self.proteins.text.get(cut_index - 1) == SEPARATION_CHARACTER
}
/// Check if a cut is the end of a protein.
@@ -407,8 +408,8 @@ impl Searcher {
/// Returns true if the cut is at the end of a protein.
#[inline]
fn check_end_of_protein(&self, cut_index: usize) -> bool {
- self.proteins.input_string[cut_index] == TERMINATION_CHARACTER
- || self.proteins.input_string[cut_index] == SEPARATION_CHARACTER
+ self.proteins.text.get(cut_index) == TERMINATION_CHARACTER
+ || self.proteins.text.get(cut_index) == SEPARATION_CHARACTER
}
/// Check if a cut is a tryptic cut, so check if the amino acid preceding the cut is K or R and the amino acid at the cut is not P.
@@ -421,8 +422,8 @@ impl Searcher {
/// Returns true if the cut is a tryptic cut.
#[inline]
fn check_tryptic_cut(&self, cut_index: usize) -> bool {
- (self.proteins.input_string[cut_index - 1] == b'K' || self.proteins.input_string[cut_index - 1] == b'R')
- && self.proteins.input_string[cut_index] != b'P'
+ (self.proteins.text.get(cut_index - 1) == b'K' || self.proteins.text.get(cut_index - 1) == b'R')
+ && self.proteins.text.get(cut_index) != b'P'
}
/// Returns true of the prefixes are the same
@@ -438,16 +439,8 @@ impl Searcher {
/// Returns true if `search_string_prefix` and `index_prefix` are considered the same, otherwise
/// false
#[inline]
- fn check_prefix(search_string_prefix: &[u8], index_prefix: &[u8], equate_il: bool) -> bool {
- if equate_il {
- search_string_prefix.iter().zip(index_prefix).all(|(&search_character, &index_character)| {
- search_character == index_character
- || (search_character == b'I' && index_character == b'L')
- || (search_character == b'L' && index_character == b'I')
- })
- } else {
- search_string_prefix == index_prefix
- }
+ fn check_prefix(search_string_prefix: &[u8], index_prefix: ProteinTextSlice, equate_il: bool) -> bool {
+ index_prefix.equals_slice(search_string_prefix, equate_il)
}
/// Returns true of the search_string and index_string are equal
@@ -471,20 +464,10 @@ impl Searcher {
skip: usize,
il_locations: &[usize],
search_string: &[u8],
- index_string: &[u8],
+ text_slice: ProteinTextSlice,
equate_il: bool
) -> bool {
- if equate_il {
- true
- } else {
- for &il_location in il_locations {
- let index = il_location - skip;
- if search_string[index] != index_string[index] {
- return false;
- }
- }
- true
- }
+ if equate_il { true } else { text_slice.check_il_locations(skip, il_locations, search_string) }
}
/// Returns all the proteins that correspond with the provided suffixes
@@ -511,6 +494,7 @@ impl Searcher {
#[cfg(test)]
mod tests {
use sa_mappings::proteins::{Protein, Proteins};
+ use text_compression::ProteinText;
use crate::{
sa_searcher::{BoundSearchResult, SearchAllSuffixesResult, Searcher},
@@ -539,9 +523,11 @@ mod tests {
}
fn get_example_proteins() -> Proteins {
- let text = "AI-BLACVAA-AC-KCRLZ$".to_string().into_bytes();
+ let input_string = "AI-CLACVAA-AC-KCRLY$";
+ let text = ProteinText::from_string(input_string);
+
Proteins {
- input_string: text,
+ text,
proteins: vec![
Protein {
uniprot_id: String::new(),
@@ -572,7 +558,7 @@ mod tests {
let proteins = get_example_proteins();
let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
// search bounds 'A'
@@ -593,7 +579,7 @@ mod tests {
let proteins = get_example_proteins();
let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
// search suffix 'VAA'
@@ -610,14 +596,14 @@ mod tests {
let proteins = get_example_proteins();
let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
let bounds_res = searcher.search_bounds(&[b'I']);
assert_eq!(bounds_res, BoundSearchResult::SearchResult((13, 16)));
// search bounds 'RIZ' with equal I and L
- let bounds_res = searcher.search_bounds(&[b'R', b'I', b'Z']);
+ let bounds_res = searcher.search_bounds(&[b'R', b'I', b'Y']);
assert_eq!(bounds_res, BoundSearchResult::SearchResult((17, 18)));
}
@@ -626,25 +612,26 @@ mod tests {
let proteins = get_example_proteins();
let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
// search bounds 'RIZ' with equal I and L
- let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true, false);
+ let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Y'], usize::MAX, true, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![16]));
// search bounds 'RIZ' without equal I and L
- let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false, false);
+ let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Y'], usize::MAX, false, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::NoMatches);
}
// test edge case where an I or L is the first index in the sparse SA.
#[test]
fn test_l_first_index_in_sa() {
- let text = "LMOXZ$".to_string().into_bytes();
+ let input_string = "LMPYY$";
+ let text = ProteinText::from_string(input_string);
let proteins = Proteins {
- input_string: text,
+ text,
proteins: vec![Protein {
uniprot_id: String::new(),
taxon_id: 0,
@@ -653,7 +640,7 @@ mod tests {
};
let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
// search bounds 'IM' with equal I and L
@@ -663,10 +650,11 @@ mod tests {
#[test]
fn test_il_missing_matches() {
- let text = "AAILLL$".to_string().into_bytes();
+ let input_string = "AAILLL$";
+ let text = ProteinText::from_string(input_string);
let proteins = Proteins {
- input_string: text,
+ text,
proteins: vec![Protein {
uniprot_id: String::new(),
taxon_id: 0,
@@ -675,7 +663,7 @@ mod tests {
};
let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true, false);
@@ -684,10 +672,11 @@ mod tests {
#[test]
fn test_il_duplication() {
- let text = "IIIILL$".to_string().into_bytes();
+ let input_string = "IIIILL$";
+ let text = ProteinText::from_string(input_string);
let proteins = Proteins {
- input_string: text,
+ text,
proteins: vec![Protein {
uniprot_id: String::new(),
taxon_id: 0,
@@ -696,7 +685,7 @@ mod tests {
};
let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true, false);
@@ -705,10 +694,11 @@ mod tests {
#[test]
fn test_il_suffix_check() {
- let text = "IIIILL$".to_string().into_bytes();
+ let input_string = "IIIILL$";
+ let text = ProteinText::from_string(input_string);
let proteins = Proteins {
- input_string: text,
+ text,
proteins: vec![Protein {
uniprot_id: String::new(),
taxon_id: 0,
@@ -717,7 +707,7 @@ mod tests {
};
let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
// search all places where II is in the string IIIILL, but with a sparse SA
@@ -728,10 +718,11 @@ mod tests {
#[test]
fn test_il_duplication2() {
- let text = "IILLLL$".to_string().into_bytes();
+ let input_string = "IILLLL$";
+ let text = ProteinText::from_string(input_string);
let proteins = Proteins {
- input_string: text,
+ text,
proteins: vec![Protein {
uniprot_id: String::new(),
taxon_id: 0,
@@ -740,7 +731,7 @@ mod tests {
};
let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
// search bounds 'IM' with equal I and L
@@ -750,10 +741,11 @@ mod tests {
#[test]
fn test_tryptic_search() {
- let text = "PAA-AAKPKAPAA$".to_string().into_bytes();
+ let input_string = "PAA-AAKPKAPAA$";
+ let text = ProteinText::from_string(input_string);
let proteins = Proteins {
- input_string: text,
+ text,
proteins: vec![Protein {
uniprot_id: String::new(),
taxon_id: 0,
@@ -762,7 +754,7 @@ mod tests {
};
let sparse_sa = SuffixArray::Original(vec![13, 3, 12, 11, 1, 4, 2, 5, 9, 8, 6, 10, 0, 7], 1);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let found_suffixes_1 = searcher.search_matching_suffixes(&[b'P', b'A', b'A'], usize::MAX, false, true);
diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs
index 121b569..a6a4e93 100644
--- a/sa-index/src/suffix_to_protein_index.rs
+++ b/sa-index/src/suffix_to_protein_index.rs
@@ -1,5 +1,6 @@
use clap::ValueEnum;
use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER};
+use text_compression::ProteinText;
use crate::Nullable;
@@ -66,10 +67,10 @@ impl DenseSuffixToProtein {
/// # Returns
///
/// Returns a new DenseSuffixToProtein build over the provided text
- pub fn new(text: &[u8]) -> Self {
+ pub fn new(text: &ProteinText) -> Self {
let mut current_protein_index: u32 = 0;
 let mut suffix_index_to_protein: Vec<u32> = vec![];
- for &char in text.iter() {
+ for char in text.iter() {
if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER {
current_protein_index += 1;
suffix_index_to_protein.push(u32::NULL);
@@ -92,9 +93,9 @@ impl SparseSuffixToProtein {
/// # Returns
///
/// Returns a new SparseSuffixToProtein build over the provided text
- pub fn new(text: &[u8]) -> Self {
+ pub fn new(text: &ProteinText) -> Self {
 let mut suffix_index_to_protein: Vec<i64> = vec![0];
- for (index, &char) in text.iter().enumerate() {
+ for (index, char) in text.iter().enumerate() {
if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER {
suffix_index_to_protein.push(index as i64 + 1);
}
@@ -108,6 +109,7 @@ impl SparseSuffixToProtein {
mod tests {
use clap::ValueEnum;
use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER};
+ use text_compression::ProteinText;
use crate::{
suffix_to_protein_index::{
@@ -116,10 +118,10 @@ mod tests {
Nullable
};
- fn build_text() -> Vec<u8> {
+ fn build_text() -> ProteinText {
let mut text = ["ACG", "CG", "AAA"].join(&format!("{}", SEPARATION_CHARACTER as char));
text.push(TERMINATION_CHARACTER as char);
- text.into_bytes()
+ ProteinText::from_string(&text)
}
#[test]
diff --git a/sa-mappings/Cargo.toml b/sa-mappings/Cargo.toml
index b20a2bf..d255f7c 100644
--- a/sa-mappings/Cargo.toml
+++ b/sa-mappings/Cargo.toml
@@ -11,3 +11,5 @@ tempdir = "0.3.7"
[dependencies]
fa-compression = { path = "../fa-compression" }
bytelines = "2.5.0"
+bitarray = { path = "../bitarray" }
+text-compression = { path = "../text-compression" }
diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs
index f2b24cc..53e52b8 100644
--- a/sa-mappings/src/proteins.rs
+++ b/sa-mappings/src/proteins.rs
@@ -5,6 +5,7 @@ use std::{error::Error, fs::File, io::BufReader, ops::Index, str::from_utf8};
use bytelines::ByteLines;
use fa_compression::algorithm1::{decode, encode};
+use text_compression::ProteinText;
/// The separation character used in the input string
pub static SEPARATION_CHARACTER: u8 = b'-';
@@ -28,7 +29,7 @@ pub struct Protein {
/// A struct that represents a collection of proteins
pub struct Proteins {
/// The input string containing all proteins
- pub input_string: Vec<u8>,
+ pub text: ProteinText,
/// The proteins in the input string
 pub proteins: Vec<Protein>
@@ -46,7 +47,6 @@ impl Proteins {
///
/// # Arguments
/// * `file` - The path to the database file
- /// * `taxon_aggregator` - The `TaxonAggregator` to use
///
/// # Returns
///
@@ -86,16 +86,52 @@ impl Proteins {
input_string.pop();
input_string.push(TERMINATION_CHARACTER.into());
- input_string.shrink_to_fit();
proteins.shrink_to_fit();
- Ok(Self { input_string: input_string.into_bytes(), proteins })
+
+ let text = ProteinText::from_string(&input_string);
+ Ok(Self { text, proteins })
+ }
+
+ /// Creates a `ProteinText` which represents all the proteins concatenated from the database file
+ ///
+ /// # Arguments
+ /// * `file` - The path to the database file
+ ///
+ /// # Returns
+ ///
+ /// Returns a `Result` containing the `ProteinText`
+ ///
+ /// # Errors
+ ///
+ /// Returns a `Box<dyn Error>` if an error occurred while reading the database file
+ pub fn try_from_database_file_without_annotations(database_file: &str) -> Result<ProteinText, Box<dyn Error>> {
+ let mut input_string: String = String::new();
+
+ let file = File::open(database_file)?;
+
+ // Read the lines as bytes, since the input string is not guaranteed to be utf8
+ // because of the encoded functional annotations
+ let mut lines = ByteLines::new(BufReader::new(file));
+
+ while let Some(Ok(line)) = lines.next() {
+ let mut fields = line.split(|b| *b == b'\t');
+
+ // only get the taxon id and sequence from each line, we don't need the other parts
+ let sequence = from_utf8(fields.nth(2).unwrap())?;
+
+ input_string.push_str(&sequence.to_uppercase());
+ input_string.push(SEPARATION_CHARACTER.into());
+ }
+
+ let text = ProteinText::from_string(&input_string);
+
+ Ok(text)
}
/// Creates a `vec` which represents all the proteins concatenated from the database file
///
/// # Arguments
/// * `file` - The path to the database file
- /// * `taxon_aggregator` - The `TaxonAggregator` to use
///
/// # Returns
///
@@ -104,7 +140,7 @@ impl Proteins {
/// # Errors
///
 /// Returns a `Box<dyn Error>` if an error occurred while reading the database file
- pub fn try_from_database_file_without_annotations(database_file: &str) -> Result<Vec<u8>, Box<dyn Error>> {
+ pub fn try_from_database_file_uncompressed(database_file: &str) -> Result<Vec<u8>, Box<dyn Error>> {
let mut input_string: String = String::new();
let file = File::open(database_file)?;
@@ -181,8 +217,10 @@ mod tests {
#[test]
fn test_new_proteins() {
+ let input_string = "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG";
+ let text = ProteinText::from_string(&input_string);
let proteins = Proteins {
- input_string: "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes().to_vec(),
+ text,
proteins: vec![
Protein {
uniprot_id: "P12345".to_string(),
@@ -197,7 +235,6 @@ mod tests {
]
};
- assert_eq!(proteins.input_string, "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes());
assert_eq!(proteins.proteins.len(), 2);
assert_eq!(proteins[0].uniprot_id, "P12345");
assert_eq!(proteins[0].taxon_id, 1);
@@ -245,12 +282,7 @@ mod tests {
let proteins = Proteins::try_from_database_file_without_annotations(database_file.to_str().unwrap()).unwrap();
- let sep_char = SEPARATION_CHARACTER as char;
- let end_char = TERMINATION_CHARACTER as char;
- let expected = format!(
- "MLPGLALLLLAAWTARALEV{}PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG{}KWDSDPSGTKTCIDT{}KEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH{}",
- sep_char, sep_char, sep_char, end_char
- );
- assert_eq!(proteins, expected.as_bytes());
+ let expected = b'L';
+ assert_eq!(proteins.get(4), expected);
}
}
diff --git a/text-compression/Cargo.toml b/text-compression/Cargo.toml
new file mode 100644
index 0000000..c312a3c
--- /dev/null
+++ b/text-compression/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "text-compression"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+bitarray = { path = "../bitarray" }
diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs
new file mode 100644
index 0000000..4866a6c
--- /dev/null
+++ b/text-compression/src/lib.rs
@@ -0,0 +1,632 @@
+use std::{
+ collections::HashMap,
+ error::Error,
+ io::{BufRead, Write}
+};
+
+use bitarray::{data_to_writer, Binary, BitArray};
+
+/// Structure representing the proteins, stored in a bit array using 5 bits per amino acid.
+pub struct ProteinText {
+ /// Bit array holding the sequence of amino acids
+ bit_array: BitArray,
+ /// Hashmap storing the mapping between the character as `u8` and a 5 bit number.
+ char_to_5bit: HashMap<u8, u8>,
+ /// Vector storing the mapping between the 5 bit number and the character as `u8`.
+ bit5_to_char: Vec<u8>
+}
+
+impl ProteinText {
+ /// Creates the hashmap storing the mappings between the characters as `u8` and 5 bit numbers.
+ ///
+ /// # Returns
+ ///
+ /// Returns the hashmap
+ fn create_char_to_5bit_hashmap() -> HashMap<u8, u8> {
+ let mut hashmap = HashMap::<u8, u8>::new();
+ for (i, c) in "ACDEFGHIKLMNPQRSTVWY-$".chars().enumerate() {
+ hashmap.insert(c as u8, i as u8);
+ }
+
+ hashmap
+ }
+
+ /// Creates the vector storing the mappings between the 5 bit numbers and the characters as `u8`.
+ ///
+ /// # Returns
+ ///
+ /// Returns the vector
+ fn create_bit5_to_char() -> Vec<u8> {
+ let mut vec = Vec::<u8>::new();
+ for c in "ACDEFGHIKLMNPQRSTVWY-$".chars() {
+ vec.push(c as u8);
+ }
+ vec
+ }
+
+ /// Creates the compressed text from a string.
+ ///
+ /// # Arguments
+ /// * `input_string` - The text (proteins) in string format
+ ///
+ /// # Returns
+ ///
+ /// An instance of `ProteinText`
+ pub fn from_string(input_string: &str) -> ProteinText {
+ let char_to_5bit = ProteinText::create_char_to_5bit_hashmap();
+ let bit5_to_char = ProteinText::create_bit5_to_char();
+
+ let mut bit_array = BitArray::with_capacity(input_string.len(), 5);
+ for (i, c) in input_string.chars().enumerate() {
+ let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect("Input character not in alphabet");
+ bit_array.set(i, char_5bit as u64);
+ }
+
+ Self { bit_array, char_to_5bit, bit5_to_char }
+ }
+
+ /// Creates the compressed text from a vector.
+ ///
+ /// # Arguments
+ /// * `input_vec` - The text (proteins) in a vector with elements of type `u8` representing the amino acids.
+ ///
+ /// # Returns
+ ///
+ /// An instance of `ProteinText`
+ pub fn from_vec(input_vec: &[u8]) -> ProteinText {
+ let char_to_5bit = ProteinText::create_char_to_5bit_hashmap();
+ let bit5_to_char = ProteinText::create_bit5_to_char();
+
+ let mut bit_array = BitArray::with_capacity(input_vec.len(), 5);
+ for (i, e) in input_vec.iter().enumerate() {
+ let char_5bit: u8 = *char_to_5bit.get(e).expect("Input character not in alphabet");
+ bit_array.set(i, char_5bit as u64);
+ }
+
+ Self { bit_array, char_to_5bit, bit5_to_char }
+ }
+
+ /// Creates the compressed text from a bit array.
+ ///
+ /// # Arguments
+ /// * `bit_array` - The text (proteins) in a bit array using 5 bits for each amino acid.
+ ///
+ /// # Returns
+ ///
+ /// An instance of `ProteinText`
+ pub fn new(bit_array: BitArray) -> ProteinText {
+ let char_to_5bit = ProteinText::create_char_to_5bit_hashmap();
+ let bit5_to_char = ProteinText::create_bit5_to_char();
+ Self { bit_array, char_to_5bit, bit5_to_char }
+ }
+
+ /// Creates an instance of `ProteinText` with a given capacity.
+ ///
+ /// # Arguments
+ /// * `capacity` - The amount of characters in the text.
+ ///
+ /// # Returns
+ ///
+ /// An instance of `ProteinText`
+ pub fn with_capacity(capacity: usize) -> Self {
+ Self::new(BitArray::with_capacity(capacity, 5))
+ }
+
+ /// Search the character at a given position in the compressed text.
+ ///
+ /// # Arguments
+ /// * `index` - The index of the character to search.
+ ///
+ /// # Returns
+ ///
+ /// the character at position `index` as `u8`.
+ pub fn get(&self, index: usize) -> u8 {
+ let char_5bit = self.bit_array.get(index) as usize;
+ self.bit5_to_char[char_5bit]
+ }
+
+ /// Set the character at a given index.
+ ///
+ /// # Arguments
+ /// * `index` - The index of the character to change.
+ /// * `value` - The character to fill in as `u8`.
+ pub fn set(&mut self, index: usize, value: u8) {
+ let char_5bit: u8 = *self.char_to_5bit.get(&value).expect("Input character not in alphabet");
+ self.bit_array.set(index, char_5bit as u64);
+ }
+
+ /// Queries the length of the text.
+ ///
+ /// # Returns
+ ///
+ /// the length of the text
+ pub fn len(&self) -> usize {
+ self.bit_array.len()
+ }
+
+ /// Check if the text is empty (length 0).
+ ///
+ /// # Returns
+ ///
+ /// true if the text has length 0, false otherwise.
+ pub fn is_empty(&self) -> bool {
+ self.bit_array.len() == 0
+ }
+
+ /// Clears the `BitArray`, setting all bits to 0.
+ pub fn clear(&mut self) {
+ self.bit_array.clear()
+ }
+
+ /// Get an iterator over the characters of the text.
+ ///
+ /// # Returns
+ ///
+ /// A `ProteinTextIterator`, which can iterate over the characters of the text.
+ pub fn iter(&self) -> ProteinTextIterator {
+ ProteinTextIterator { protein_text: self, index: 0 }
+ }
+
+ /// Get a slice of the text
+ ///
+ /// # Returns
+ ///
+ /// A `ProteinTextSlice` representing a slice of the text.
+ pub fn slice(&self, start: usize, end: usize) -> ProteinTextSlice {
+ ProteinTextSlice::new(self, start, end)
+ }
+}
+
+/// Structure representing a slice of a `ProteinText`.
+pub struct ProteinTextSlice<'a> {
+ /// The `ProteinText` of which to take a slice.
+ text: &'a ProteinText,
+ /// The start of the slice.
+ start: usize, // included
+ /// The end of the slice.
+ end: usize // excluded
+}
+
+impl<'a> ProteinTextSlice<'a> {
+ /// Creates an instance of `ProteinTextSlice`, given the text and boundaries.
+ ///
+ /// # Arguments
+ /// * `text` - The `ProteinText` representing the text of proteins with 5 bits per amino acid.
+ /// * `start` - The start of the slice.
+ /// * `end` - The end of the slice.
+ ///
+ /// # Returns
+ ///
+ /// An instance of `ProteinTextSlice`
+ pub fn new(text: &'a ProteinText, start: usize, end: usize) -> ProteinTextSlice<'a> {
+ Self { text, start, end }
+ }
+
+ /// Get a character (amino acid) in the slice.
+ ///
+ /// # Arguments
+ /// * `index` - The index in the slice of the character to get.
+ ///
+ /// # Returns
+ ///
+ /// The character as `u8`.
+ pub fn get(&self, index: usize) -> u8 {
+ self.text.get(self.start + index)
+ }
+
+ /// Get the length of the slice.
+ ///
+ /// # Returns
+ ///
+ /// The length of the slice.
+ pub fn len(&self) -> usize {
+ self.end - self.start
+ }
+
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Checks if the slice and a given array of `u8` are equal.
+ /// I and L can be equated.
+ ///
+ /// # Arguments
+ /// * `other` - the array of `u8` to compare the slice with.
+ /// * `equate_il` - true if I and L need to be equated, false otherwise.
+ ///
+ /// # Returns
+ ///
+ /// True if the slice is equal to the given array, false otherwise.
+ #[inline]
+ pub fn equals_slice(&self, other: &[u8], equate_il: bool) -> bool {
+ if equate_il {
+ other.iter().zip(self.iter()).all(|(&search_character, text_character)| {
+ search_character == text_character
+ || (search_character == b'I' && text_character == b'L')
+ || (search_character == b'L' && text_character == b'I')
+ })
+ } else {
+ other
+ .iter()
+ .zip(self.iter())
+ .all(|(&search_character, text_character)| search_character == text_character)
+ }
+ }
+
+ /// Check if the slice and a given array of `u8` are equal on the I and L positions.
+ ///
+ /// # Arguments
+ /// * `skip` - The amount of positions this slice skipped, this has an influence on the I and L positions.
+ /// * `il_locations` - The positions where I and L occur.
+ /// * `search_string` - An array of `u8` to compare the slice with.
+ ///
+ /// # Returns
+ ///
+ /// True if the slice and `search_string` have the same contents on the I and L positions, false otherwise.
+ pub fn check_il_locations(&self, skip: usize, il_locations: &[usize], search_string: &[u8]) -> bool {
+ for &il_location in il_locations {
+ let index = il_location - skip;
+ if search_string[index] != self.get(index) {
+ return false;
+ }
+ }
+ true
+ }
+
+ /// Get an iterator over the slice.
+ ///
+ /// # Returns
+ ///
+ /// An iterator over the slice.
+ pub fn iter(&self) -> ProteinTextSliceIterator {
+ ProteinTextSliceIterator { text_slice: self, index: 0 }
+ }
+}
+
+/// Structure representing an iterator over a `ProteinText` instance, iterating the characters of the text.
+pub struct ProteinTextIterator<'a> {
+ protein_text: &'a ProteinText,
+ index: usize
+}
+
+ /// Structure representing an iterator over a `ProteinTextSlice` instance, iterating the characters of the slice.
+pub struct ProteinTextSliceIterator<'a> {
+ text_slice: &'a ProteinTextSlice<'a>,
+ index: usize
+}
+
+impl<'a> Iterator for ProteinTextSliceIterator<'a> {
+ type Item = u8;
+
+ /// Get the next character in the `ProteinTextSlice`.
+ ///
+ /// # Returns
+ ///
+ /// The next character in the slice.
+ fn next(&mut self) -> Option<u8> {
+ if self.index >= self.text_slice.len() {
+ return None;
+ }
+
+ self.index += 1;
+ Some(self.text_slice.get(self.index - 1))
+ }
+}
+
+impl<'a> Iterator for ProteinTextIterator<'a> {
+ type Item = u8;
+
+ /// Get the next character in the `ProteinText`.
+ ///
+ /// # Returns
+ ///
+ /// The next character in the text.
+ fn next(&mut self) -> Option<u8> {
+ if self.index >= self.protein_text.len() {
+ return None;
+ }
+
+ self.index += 1;
+ Some(self.protein_text.get(self.index - 1))
+ }
+}
+
+/// Writes the compressed text to a writer.
+///
+/// # Arguments
+///
+/// * `text` - The text to be compressed.
+/// * `writer` - The writer to which the compressed text will be written.
+///
+/// # Errors
+///
+/// Returns an error if writing to the writer fails.
+ pub fn dump_compressed_text(text: Vec<u8>, writer: &mut impl Write) -> Result<(), Box<dyn Error>> {
+ let bits_per_value = 5;
+
+ // Write the flags to the writer
+ // 00000001 indicates that the text is compressed
+ writer
+ .write(&[bits_per_value as u8])
+ .map_err(|_| "Could not write the required bits to the writer")?;
+
+ // Write the size of the text to the writer
+ writer
+ .write(&(text.len() as u64).to_le_bytes())
+ .map_err(|_| "Could not write the size of the text to the writer")?;
+
+ // Compress the text and write it to the writer
+ let text_writer: Vec<u64> = text.iter().map(|item| <u64>::from(*item)).collect();
+ data_to_writer(text_writer, bits_per_value, 8 * 1024, writer)
+ .map_err(|_| "Could not write the compressed text to the writer")?;
+
+ Ok(())
+}
+
+/// Load the compressed text from a reader.
+///
+/// # Arguments
+///
+/// * `reader` - The reader from which the compressed text will be read.
+///
+/// # Errors
+///
+/// Returns an error if reading from the reader fails.
+ pub fn load_compressed_text(reader: &mut impl BufRead) -> Result<ProteinText, Box<dyn Error>> {
+ let bits_per_value: usize = 5;
+ // Read the size of the text from the binary file (8 bytes)
+ let mut size_buffer = [0_u8; 8];
+ reader
+ .read_exact(&mut size_buffer)
+ .map_err(|_| "Could not read the size of the text from the binary file")?;
+ let size = u64::from_le_bytes(size_buffer) as usize;
+
+ // Read the compressed text from the binary file
+ let mut compressed_text = BitArray::with_capacity(size, bits_per_value);
+ compressed_text
+ .read_binary(reader)
+ .map_err(|_| "Could not read the compressed text from the binary file")?;
+
+ Ok(ProteinText::new(compressed_text))
+}
+
+#[cfg(test)]
+mod tests {
+ use std::io::Read;
+
+ use super::*;
+
+ pub struct FailingWriter {
+ /// The number of times the write function can be called before it fails.
+ pub valid_write_count: usize
+ }
+
+ impl Write for FailingWriter {
+ fn write(&mut self, _: &[u8]) -> Result<usize, std::io::Error> {
+ if self.valid_write_count == 0 {
+ return Err(std::io::Error::new(std::io::ErrorKind::Other, "Write failed"));
+ }
+
+ self.valid_write_count -= 1;
+ Ok(1)
+ }
+
+ fn flush(&mut self) -> Result<(), std::io::Error> {
+ Ok(())
+ }
+ }
+
+ pub struct FailingReader {
+ /// The number of times the read function can be called before it fails.
+ pub valid_read_count: usize
+ }
+
+ impl Read for FailingReader {
+ fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+ if self.valid_read_count == 0 {
+ return Err(std::io::Error::new(std::io::ErrorKind::Other, "Read failed"));
+ }
+
+ self.valid_read_count -= 1;
+ Ok(buf.len())
+ }
+ }
+
+ impl BufRead for FailingReader {
+ fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
+ Ok(&[])
+ }
+
+ fn consume(&mut self, _: usize) {}
+ }
+
+ #[test]
+ fn test_u8_5bit_conversion() {
+ let char_to_5bit = ProteinText::create_char_to_5bit_hashmap();
+ let bit5_to_char = ProteinText::create_bit5_to_char();
+
+ for c in "ACDEFGHIKLMNPQRSTVWY-$".chars() {
+ let char_5bit = char_to_5bit.get(&(c as u8)).unwrap();
+ assert_eq!(c as u8, bit5_to_char[*char_5bit as usize]);
+ }
+ }
+
+ #[test]
+ fn test_build_from_string() {
+ let text = ProteinText::from_string("ACACA-CAC$");
+
+ for (i, c) in "ACACA-CAC$".chars().enumerate() {
+ assert_eq!(c as u8, text.get(i));
+ }
+ }
+
+ #[test]
+ fn test_build_from_vec() {
+ let vec = vec![b'A', b'C', b'A', b'C', b'A', b'-', b'C', b'A', b'C', b'$'];
+ let text = ProteinText::from_vec(&vec);
+
+ for (i, c) in "ACACA-CAC$".chars().enumerate() {
+ assert_eq!(c as u8, text.get(i));
+ }
+ }
+
+ #[test]
+ fn test_build_from_bitarray() {
+ let input_string = "ACACA-CAC$";
+ let char_to_5bit = ProteinText::create_char_to_5bit_hashmap();
+
+ let mut bit_array = BitArray::with_capacity(input_string.len(), 5);
+ for (i, c) in input_string.chars().enumerate() {
+ let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect("Input character not in alphabet");
+ bit_array.set(i, char_5bit as u64);
+ }
+
+ let text = ProteinText::new(bit_array);
+
+ for (i, c) in "ACACA-CAC$".chars().enumerate() {
+ assert_eq!(c as u8, text.get(i));
+ }
+ }
+
+ #[test]
+ fn test_build_with_capacity() {
+ let input_string = "ACACA-CAC$";
+
+ let mut text = ProteinText::with_capacity(input_string.len());
+ for (i, c) in "ACACA-CAC$".chars().enumerate() {
+ text.set(i, c as u8);
+ }
+
+ for (i, c) in "ACACA-CAC$".chars().enumerate() {
+ assert_eq!(c as u8, text.get(i));
+ }
+ }
+
+ #[test]
+ fn test_text_slice() {
+ let input_string = "ACACA-CAC$";
+ let start = 1;
+ let end = 5;
+ let text = ProteinText::from_string(&input_string);
+ let text_slice = text.slice(start, end);
+
+ for (i, c) in input_string[start..end].chars().enumerate() {
+ assert_eq!(c as u8, text_slice.get(i));
+ }
+ }
+
+ #[test]
+ fn test_equals_slice() {
+ let input_string = "ACICA-CAC$";
+ let text = ProteinText::from_string(&input_string);
+ let text_slice = text.slice(1, 5);
+ let eq_slice_true = [b'C', b'I', b'C', b'A'];
+ let eq_slice_false = [b'C', b'C', b'C', b'A'];
+ let eq_slice_il_true = [b'C', b'L', b'C', b'A'];
+
+ assert!(text_slice.equals_slice(&eq_slice_true, false));
+ assert!(!text_slice.equals_slice(&eq_slice_false, false));
+ assert!(text_slice.equals_slice(&eq_slice_il_true, true));
+ }
+
+ #[test]
+ fn test_check_il_locations() {
+ let input_string = "ACILA-CAC$";
+ let text = ProteinText::from_string(&input_string);
+ let text_slice = text.slice(1, 5);
+ let il_locations = [1, 2];
+ let il_true = [b'C', b'I', b'L', b'A'];
+ let il_false = [b'C', b'I', b'C', b'A'];
+
+ assert!(text_slice.check_il_locations(0, &il_locations, &il_true));
+ assert!(!text_slice.check_il_locations(0, &il_locations, &il_false));
+ }
+
+ #[test]
+ fn test_dump_compressed_text() {
+ let text: Vec<u8> = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
+
+ let mut writer = vec![];
+ dump_compressed_text(text, &mut writer).unwrap();
+
+ assert_eq!(writer, vec![
+ // bits per value
+ 5, // size of the text
+ 10, 0, 0, 0, 0, 0, 0, 0, // compressed text
+ 0, 128, 74, 232, 152, 66, 134, 8
+ ]);
+ }
+
+ #[test]
+ #[should_panic(expected = "Could not write the required bits to the writer")]
+ fn test_dump_compressed_text_fail_required_bits() {
+ let mut writer = FailingWriter { valid_write_count: 0 };
+
+ dump_compressed_text(vec![], &mut writer).unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "Could not write the size of the text to the writer")]
+ fn test_dump_compressed_text_fail_size() {
+ let mut writer = FailingWriter { valid_write_count: 1 };
+
+ dump_compressed_text(vec![], &mut writer).unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "Could not write the compressed text to the writer")]
+ fn test_dump_compressed_text_fail_compressed_text() {
+ let mut writer = FailingWriter { valid_write_count: 3 };
+
+ dump_compressed_text(vec![1], &mut writer).unwrap();
+ }
+
+ #[test]
+ fn test_load_compressed_text() {
+ let data = vec![
+ // size of the text
+ 10, 0, 0, 0, 0, 0, 0, 0, // compressed text
+ 0, 128, 74, 232, 152, 66, 134, 8,
+ ];
+
+ let mut reader = std::io::BufReader::new(&data[..]);
+ let compressed_text = load_compressed_text(&mut reader).unwrap();
+
+ for (i, c) in "CDEFGHIKLM".chars().enumerate() {
+ assert_eq!(compressed_text.get(i), c as u8);
+ }
+ }
+
+ #[test]
+ #[should_panic(expected = "Could not read the size of the text from the binary file")]
+ fn test_load_compressed_text_fail_size() {
+ let mut reader = FailingReader { valid_read_count: 0 };
+
+ load_compressed_text(&mut reader).unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "Could not read the compressed text from the binary file")]
+ fn test_load_compressed_text_fail_compressed_text() {
+ let mut reader = FailingReader { valid_read_count: 2 };
+
+ load_compressed_text(&mut reader).unwrap();
+ }
+
+ #[test]
+ fn test_failing_writer() {
+ let mut writer = FailingWriter { valid_write_count: 0 };
+ assert!(writer.flush().is_ok());
+ assert!(writer.write(&[0]).is_err());
+ }
+
+ #[test]
+ fn test_failing_reader() {
+ let mut reader = FailingReader { valid_read_count: 0 };
+ let right_buffer: [u8; 0] = [];
+ assert_eq!(reader.fill_buf().unwrap(), &right_buffer);
+ assert_eq!(reader.consume(0), ());
+ let mut buffer = [0_u8; 1];
+ assert!(reader.read(&mut buffer).is_err());
+ }
+}
diff --git a/unipept-index.iml b/unipept-index.iml
new file mode 100644
index 0000000..ce5666f
--- /dev/null
+++ b/unipept-index.iml
@@ -0,0 +1,24 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file