From fb6e77a2f87f52aeb0b035c0579768b72200788f Mon Sep 17 00:00:00 2001
From: SimonVandeVyver <simon.vandevyver@ugent.be>
Date: Wed, 11 Sep 2024 08:50:04 +0200
Subject: [PATCH] Represent chars in protein text with 5 bits, tests don't work
 yet

---
 Cargo.lock                              |  10 +
 bitarray/src/binary.rs                  |   8 +-
 bitarray/src/lib.rs                     |  18 +-
 sa-index/Cargo.toml                     |   1 +
 sa-index/src/lib.rs                     |  10 +-
 sa-index/src/sa_searcher.rs             |  95 +++---
 sa-index/src/suffix_to_protein_index.rs |  14 +-
 sa-mappings/Cargo.toml                  |   2 +
 sa-mappings/src/proteins.rs             |  35 +--
 text-compression/Cargo.toml             |   9 +
 text-compression/src/lib.rs             | 391 ++++++++++++++++++++++++
 11 files changed, 506 insertions(+), 87 deletions(-)
 create mode 100644 text-compression/Cargo.toml
 create mode 100644 text-compression/src/lib.rs

diff --git a/Cargo.lock b/Cargo.lock
index c29abc3..9d81263 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1127,15 +1127,18 @@ dependencies = [
  "serde",
  "serde_json",
  "tempdir",
+ "text-compression",
 ]
 
 [[package]]
 name = "sa-mappings"
 version = "0.1.0"
 dependencies = [
+ "bitarray",
  "bytelines",
  "fa-compression",
  "tempdir",
+ "text-compression",
 ]
 
 [[package]]
@@ -1275,6 +1278,13 @@ dependencies = [
  "remove_dir_all",
 ]
 
+[[package]]
+name = "text-compression"
+version = "0.1.0"
+dependencies = [
+ "bitarray",
+]
+
 [[package]]
 name = "tinytemplate"
 version = "1.2.1"
diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs
index e7265cd..a8084d1 100644
--- a/bitarray/src/binary.rs
+++ b/bitarray/src/binary.rs
@@ -159,10 +159,10 @@ mod tests {
     #[test]
     fn test_write_binary() {
         let mut bitarray = BitArray::with_capacity(4, 40);
-        bitarray.set(0, 0x1234567890);
-        bitarray.set(1, 0xabcdef0123);
-        bitarray.set(2, 0x4567890abc);
-        bitarray.set(3, 0xdef0123456);
+        bitarray.set(0, 0x1234567890_u64);
+        bitarray.set(1, 0xabcdef0123_u64);
+        bitarray.set(2, 0x4567890abc_u64);
+        bitarray.set(3, 0xdef0123456_u64);
 
         let mut buffer = Vec::new();
         bitarray.write_binary(&mut buffer).unwrap();
diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs
index 655d17e..fe7b532 100644
--- a/bitarray/src/lib.rs
+++ b/bitarray/src/lib.rs
@@ -19,7 +19,7 @@ pub struct BitArray {
     /// The length of the bit array.
     len: usize,
     /// The number of bits in a single element of the data vector.
-    bits_per_value: usize
+    bits_per_value: usize,
 }
 
 impl BitArray {
@@ -39,7 +39,7 @@ impl BitArray {
             data: vec![0; capacity * bits_per_value / 64 + extra],
             mask: (1 << bits_per_value) - 1,
             len: capacity,
-            bits_per_value
+            bits_per_value,
         }
     }
 
@@ -85,6 +85,7 @@ impl BitArray {
     /// * `index` - The index of the value to set.
     /// * `value` - The value to set at the specified index.
     pub fn set(&mut self, index: usize, value: u64) {
+        let value: u64 = value.into();
         let start_block = index * self.bits_per_value / 64;
         let start_block_offset = index * self.bits_per_value % 64;
 
@@ -142,6 +143,11 @@ impl BitArray {
     pub fn clear(&mut self) {
         self.data.iter_mut().for_each(|x| *x = 0);
     }
+
+    /// Returns the backing `u64` words `start_slice..end_slice`; panics if the range is out of bounds.
+    pub fn get_data_slice(&self, start_slice: usize, end_slice: usize) -> &[u64] {
+        &self.data[start_slice..end_slice]
+    }
 }
 
 /// Writes the data to a writer in a binary format using a bit array. This function is helpfull
@@ -257,10 +263,10 @@ mod tests {
     fn test_bitarray_set() {
         let mut bitarray = BitArray::with_capacity(4, 40);
 
-        bitarray.set(0, 0b0001110011111010110001000111111100110010);
-        bitarray.set(1, 0b1100001001010010011000010100110111001001);
-        bitarray.set(2, 0b1111001101001101101101101011101001010001);
-        bitarray.set(3, 0b0000100010010001010001001110101110011100);
+        bitarray.set(0, 0b0001110011111010110001000111111100110010_u64);
+        bitarray.set(1, 0b1100001001010010011000010100110111001001_u64);
+        bitarray.set(2, 0b1111001101001101101101101011101001010001_u64);
+        bitarray.set(3, 0b0000100010010001010001001110101110011100_u64);
 
         assert_eq!(bitarray.data, vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000]);
     }
diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml
index de57fc9..25dda76 100644
--- a/sa-index/Cargo.toml
+++ b/sa-index/Cargo.toml
@@ -14,5 +14,6 @@ clap = { version = "4.4.8", features = ["derive"] }
 rayon = "1.8.1"
 serde = { version = "1.0.197", features = ["derive"] }
 sa-mappings = { path = "../sa-mappings" }
+text-compression = { path = "../text-compression" }
 bitarray = { path = "../bitarray" }
 serde_json = "1.0.116"
diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs
index f276906..53f5348 100644
--- a/sa-index/src/lib.rs
+++ b/sa-index/src/lib.rs
@@ -115,11 +115,11 @@ mod tests {
     #[test]
     fn test_suffix_array_compressed() {
         let mut bitarray = BitArray::with_capacity(5, 40);
-        bitarray.set(0, 1);
-        bitarray.set(1, 2);
-        bitarray.set(2, 3);
-        bitarray.set(3, 4);
-        bitarray.set(4, 5);
+        bitarray.set(0, 1 as u64);
+        bitarray.set(1, 2 as u64);
+        bitarray.set(2, 3 as u64);
+        bitarray.set(3, 4 as u64);
+        bitarray.set(4, 5 as u64);
 
         let sa = SuffixArray::Compressed(bitarray, 1);
         assert_eq!(sa.len(), 5);
diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs
index d09c704..7f60cbb 100644
--- a/sa-index/src/sa_searcher.rs
+++ b/sa-index/src/sa_searcher.rs
@@ -1,6 +1,7 @@
 use std::{cmp::min, ops::Deref};
 
 use sa_mappings::proteins::{Protein, Proteins};
+use text_compression::ProteinTextSlice;
 
 use crate::{
     sa_searcher::BoundSearch::{Maximum, Minimum},
@@ -75,7 +76,7 @@ pub struct SparseSearcher(Searcher);
 
 impl SparseSearcher {
     pub fn new(sa: SuffixArray, proteins: Proteins) -> Self {
-        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
         let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
         Self(searcher)
     }
@@ -93,7 +94,7 @@ pub struct DenseSearcher(Searcher);
 
 impl DenseSearcher {
     pub fn new(sa: SuffixArray, proteins: Proteins) -> Self {
-        let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.input_string);
+        let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.text);
         let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
         Self(searcher)
     }
@@ -176,12 +177,12 @@ impl Searcher {
 
         // match as long as possible
         while index_in_search_string < search_string.len()
-            && index_in_suffix < self.proteins.input_string.len()
-            && (search_string[index_in_search_string] == self.proteins.input_string[index_in_suffix]
+            && index_in_suffix < self.proteins.text.len()
+            && (search_string[index_in_search_string] == self.proteins.text.get(index_in_suffix) as u8
                 || (search_string[index_in_search_string] == b'L'
-                    && self.proteins.input_string[index_in_suffix] == b'I')
+                    && self.proteins.text.get(index_in_suffix) as u8 == b'I')
                 || (search_string[index_in_search_string] == b'I'
-                    && self.proteins.input_string[index_in_suffix] == b'L'))
+                    && self.proteins.text.get(index_in_suffix) as u8 == b'L'))
         {
             index_in_suffix += 1;
             index_in_search_string += 1;
@@ -191,7 +192,7 @@ impl Searcher {
         if !search_string.is_empty() {
             if index_in_search_string == search_string.len() {
                 is_cond_or_equal = true
-            } else if index_in_suffix < self.proteins.input_string.len() {
+            } else if index_in_suffix < self.proteins.text.len() {
                 // in our index every L was replaced by a I, so we need to replace them if we want
                 // to search in the right direction
                 let peptide_char = if search_string[index_in_search_string] == b'L' {
@@ -200,10 +201,10 @@ impl Searcher {
                     search_string[index_in_search_string]
                 };
 
-                let protein_char = if self.proteins.input_string[index_in_suffix] == b'L' {
+                let protein_char = if self.proteins.text.get(index_in_suffix) as u8 == b'L' {
                     b'I'
                 } else {
-                    self.proteins.input_string[index_in_suffix]
+                    self.proteins.text.get(index_in_suffix) as u8
                 };
 
                 is_cond_or_equal = condition_check(peptide_char, protein_char);
@@ -340,16 +341,14 @@ impl Searcher {
                     // check at all
                     if suffix >= skip
                         && ((skip == 0
-                            || Self::check_prefix(
-                                current_search_string_prefix,
-                                &self.proteins.input_string[suffix - skip..suffix],
-                                equate_il
-                            ))
-                            && Self::check_suffix(
+                            || ProteinTextSlice::new(&self.proteins.text, suffix - skip, suffix)
+                                .equals_slice(current_search_string_prefix, equate_il))
+                            && 
+                            Self::check_suffix(
                                 skip,
                                 il_locations_current_suffix,
                                 current_search_string_suffix,
-                                &self.proteins.input_string[suffix..suffix + search_string.len() - skip],
+                                ProteinTextSlice::new(&self.proteins.text, suffix, suffix + search_string.len() - skip),
                                 equate_il
                             ))
                     {
@@ -419,19 +418,13 @@ impl Searcher {
         skip: usize,
         il_locations: &[usize],
         search_string: &[u8],
-        index_string: &[u8],
+        text_slice: ProteinTextSlice,
         equate_il: bool
     ) -> bool {
         if equate_il {
             true
         } else {
-            for &il_location in il_locations {
-                let index = il_location - skip;
-                if search_string[index] != index_string[index] {
-                    return false;
-                }
-            }
-            true
+            text_slice.check_il_locations(skip, il_locations, search_string)
         }
     }
 
@@ -459,6 +452,7 @@ impl Searcher {
 #[cfg(test)]
 mod tests {
     use sa_mappings::proteins::{Protein, Proteins};
+    use text_compression::ProteinText;
 
     use crate::{
         sa_searcher::{BoundSearchResult, SearchAllSuffixesResult, Searcher},
@@ -487,9 +481,11 @@ mod tests {
     }
 
     fn get_example_proteins() -> Proteins {
-        let text = "AI-BLACVAA-AC-KCRLZ$".to_string().into_bytes();
+        let input_string = "AI-BLACVAA-AC-KCRLZ$";
+        let text = ProteinText::from_string(input_string);
+
         Proteins {
-            input_string: text,
+            text,
             proteins: vec![
                 Protein {
                     uniprot_id: String::new(),
@@ -520,7 +516,7 @@ mod tests {
         let proteins = get_example_proteins();
         let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1);
 
-        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
         let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
 
         // search bounds 'A'
@@ -541,7 +537,7 @@ mod tests {
         let proteins = get_example_proteins();
         let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3);
 
-        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
         let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
 
         // search suffix 'VAA'
@@ -558,7 +554,7 @@ mod tests {
         let proteins = get_example_proteins();
         let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1);
 
-        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
         let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
 
         let bounds_res = searcher.search_bounds(&[b'I']);
@@ -574,7 +570,7 @@ mod tests {
         let proteins = get_example_proteins();
         let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3);
 
-        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
         let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
 
         // search bounds 'RIZ' with equal I and L
@@ -589,10 +585,11 @@ mod tests {
     // test edge case where an I or L is the first index in the sparse SA.
     #[test]
     fn test_l_first_index_in_sa() {
-        let text = "LMOXZ$".to_string().into_bytes();
+        let input_string = "LMOXZ$";
+        let text = ProteinText::from_string(input_string);
 
         let proteins = Proteins {
-            input_string: text,
+            text,
             proteins: vec![Protein {
                 uniprot_id: String::new(),
                 taxon_id: 0,
@@ -601,7 +598,7 @@ mod tests {
         };
 
         let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2);
-        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
         let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
 
         // search bounds 'IM' with equal I and L
@@ -611,10 +608,11 @@ mod tests {
 
     #[test]
     fn test_il_missing_matches() {
-        let text = "AAILLL$".to_string().into_bytes();
+        let input_string = "AAILLL$";
+        let text = ProteinText::from_string(input_string);
 
         let proteins = Proteins {
-            input_string: text,
+            text,
             proteins: vec![Protein {
                 uniprot_id: String::new(),
                 taxon_id: 0,
@@ -623,7 +621,7 @@ mod tests {
         };
 
         let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1);
-        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
         let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
 
         let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true);
@@ -632,19 +630,20 @@ mod tests {
 
     #[test]
     fn test_il_duplication() {
-        let text = "IIIILL$".to_string().into_bytes();
+        let input_string = "IIIILL$";
+        let text = ProteinText::from_string(input_string);
 
         let proteins = Proteins {
-            input_string: text,
+            text,
             proteins: vec![Protein {
                 uniprot_id: String::new(),
                 taxon_id: 0,
                 functional_annotations: vec![]
             }]
         };
-
+        
         let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1);
-        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
         let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
 
         let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true);
@@ -653,10 +652,11 @@ mod tests {
 
     #[test]
     fn test_il_suffix_check() {
-        let text = "IIIILL$".to_string().into_bytes();
-
+        let input_string = "IIIILL$";
+        let text = ProteinText::from_string(input_string);
+        
         let proteins = Proteins {
-            input_string: text,
+            text,
             proteins: vec![Protein {
                 uniprot_id: String::new(),
                 taxon_id: 0,
@@ -665,7 +665,7 @@ mod tests {
         };
 
         let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2);
-        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
         let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
 
         // search all places where II is in the string IIIILL, but with a sparse SA
@@ -676,10 +676,11 @@ mod tests {
 
     #[test]
     fn test_il_duplication2() {
-        let text = "IILLLL$".to_string().into_bytes();
+        let input_string = "IILLLL$";
+        let text = ProteinText::from_string(input_string);
 
         let proteins = Proteins {
-            input_string: text,
+            text,
             proteins: vec![Protein {
                 uniprot_id: String::new(),
                 taxon_id: 0,
@@ -688,7 +689,7 @@ mod tests {
         };
 
         let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1);
-        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+        let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
         let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
 
         // search bounds 'IM' with equal I and L
diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs
index 121b569..6aed362 100644
--- a/sa-index/src/suffix_to_protein_index.rs
+++ b/sa-index/src/suffix_to_protein_index.rs
@@ -2,6 +2,7 @@ use clap::ValueEnum;
 use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER};
 
 use crate::Nullable;
+use text_compression::ProteinText;
 
 /// Enum used to define the commandline arguments and choose which index style is used
 #[derive(ValueEnum, Clone, Debug, PartialEq)]
@@ -66,10 +67,10 @@ impl DenseSuffixToProtein {
     /// # Returns
     ///
     /// Returns a new DenseSuffixToProtein build over the provided text
-    pub fn new(text: &[u8]) -> Self {
+    pub fn new(text: &ProteinText) -> Self {
         let mut current_protein_index: u32 = 0;
         let mut suffix_index_to_protein: Vec<u32> = vec![];
-        for &char in text.iter() {
+        for char in text.iter() {
             if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER {
                 current_protein_index += 1;
                 suffix_index_to_protein.push(u32::NULL);
@@ -92,9 +93,9 @@ impl SparseSuffixToProtein {
     /// # Returns
     ///
     /// Returns a new SparseSuffixToProtein build over the provided text
-    pub fn new(text: &[u8]) -> Self {
+    pub fn new(text: &ProteinText) -> Self {
         let mut suffix_index_to_protein: Vec<i64> = vec![0];
-        for (index, &char) in text.iter().enumerate() {
+        for (index, char) in text.iter().enumerate() {
             if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER {
                 suffix_index_to_protein.push(index as i64 + 1);
             }
@@ -108,6 +109,7 @@ impl SparseSuffixToProtein {
 mod tests {
     use clap::ValueEnum;
     use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER};
+    use text_compression::ProteinText;
 
     use crate::{
         suffix_to_protein_index::{
@@ -116,10 +118,10 @@ mod tests {
         Nullable
     };
 
-    fn build_text() -> Vec<u8> {
+    fn build_text() -> ProteinText {
         let mut text = ["ACG", "CG", "AAA"].join(&format!("{}", SEPARATION_CHARACTER as char));
         text.push(TERMINATION_CHARACTER as char);
-        text.into_bytes()
+        ProteinText::from_string(&text)
     }
 
     #[test]
diff --git a/sa-mappings/Cargo.toml b/sa-mappings/Cargo.toml
index b20a2bf..d255f7c 100644
--- a/sa-mappings/Cargo.toml
+++ b/sa-mappings/Cargo.toml
@@ -11,3 +11,5 @@ tempdir = "0.3.7"
 [dependencies]
 fa-compression = { path = "../fa-compression" }
 bytelines = "2.5.0"
+bitarray = { path = "../bitarray" }
+text-compression = { path = "../text-compression" }
diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs
index f2b24cc..ca3bdd7 100644
--- a/sa-mappings/src/proteins.rs
+++ b/sa-mappings/src/proteins.rs
@@ -5,6 +5,7 @@ use std::{error::Error, fs::File, io::BufReader, ops::Index, str::from_utf8};
 
 use bytelines::ByteLines;
 use fa_compression::algorithm1::{decode, encode};
+use text_compression::ProteinText;
 
 /// The separation character used in the input string
 pub static SEPARATION_CHARACTER: u8 = b'-';
@@ -28,7 +29,7 @@ pub struct Protein {
 /// A struct that represents a collection of proteins
 pub struct Proteins {
     /// The input string containing all proteins
-    pub input_string: Vec<u8>,
+    pub text: ProteinText,
 
     /// The proteins in the input string
     pub proteins: Vec<Protein>
@@ -86,12 +87,13 @@ impl Proteins {
 
         input_string.pop();
         input_string.push(TERMINATION_CHARACTER.into());
-        input_string.shrink_to_fit();
         proteins.shrink_to_fit();
-        Ok(Self { input_string: input_string.into_bytes(), proteins })
+
+        let text = ProteinText::from_string(&input_string);
+        Ok(Self { text, proteins })
     }
 
-    /// Creates a `vec<u8>` which represents all the proteins concatenated from the database file
+    /// Creates a `ProteinText` which represents all the proteins concatenated from the database file
     ///
     /// # Arguments
     /// * `file` - The path to the database file
@@ -99,12 +101,12 @@ impl Proteins {
     ///
     /// # Returns
     ///
-    /// Returns a `Result` containing the `Vec<u8>`
+    /// Returns a `Result` containing the `ProteinText`
     ///
     /// # Errors
     ///
     /// Returns a `Box<dyn Error>` if an error occurred while reading the database file
-    pub fn try_from_database_file_without_annotations(database_file: &str) -> Result<Vec<u8>, Box<dyn Error>> {
+    pub fn try_from_database_file_without_annotations(database_file: &str) -> Result<ProteinText, Box<dyn Error>> {
         let mut input_string: String = String::new();
 
         let file = File::open(database_file)?;
@@ -123,11 +125,10 @@ impl Proteins {
             input_string.push(SEPARATION_CHARACTER.into());
         }
 
-        input_string.pop();
-        input_string.push(TERMINATION_CHARACTER.into());
+        let text = ProteinText::from_string(&input_string);
+
+        Ok(text)
 
-        input_string.shrink_to_fit();
-        Ok(input_string.into_bytes())
     }
 }
 
@@ -181,8 +182,10 @@ mod tests {
 
     #[test]
     fn test_new_proteins() {
+        let input_string = "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG";
+        let text = ProteinText::from_string(&input_string);
         let proteins = Proteins {
-            input_string: "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes().to_vec(),
+            text,
             proteins: vec![
                 Protein {
                     uniprot_id: "P12345".to_string(),
@@ -197,7 +200,6 @@ mod tests {
             ]
         };
 
-        assert_eq!(proteins.input_string, "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes());
         assert_eq!(proteins.proteins.len(), 2);
         assert_eq!(proteins[0].uniprot_id, "P12345");
         assert_eq!(proteins[0].taxon_id, 1);
@@ -245,12 +247,7 @@ mod tests {
 
         let proteins = Proteins::try_from_database_file_without_annotations(database_file.to_str().unwrap()).unwrap();
 
-        let sep_char = SEPARATION_CHARACTER as char;
-        let end_char = TERMINATION_CHARACTER as char;
-        let expected = format!(
-            "MLPGLALLLLAAWTARALEV{}PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG{}KWDSDPSGTKTCIDT{}KEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH{}",
-            sep_char, sep_char, sep_char, end_char
-        );
-        assert_eq!(proteins, expected.as_bytes());
+        let expected = 'L' as u8;
+        assert_eq!(proteins.get(4), expected);
     }
 }
diff --git a/text-compression/Cargo.toml b/text-compression/Cargo.toml
new file mode 100644
index 0000000..c312a3c
--- /dev/null
+++ b/text-compression/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "text-compression"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+bitarray = { path = "../bitarray" }
diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs
new file mode 100644
index 0000000..60b2463
--- /dev/null
+++ b/text-compression/src/lib.rs
@@ -0,0 +1,391 @@
+use std::{
+    error::Error,
+    io::{BufRead, Write}
+};
+use std::collections::HashMap;
+
+use bitarray::{data_to_writer, Binary, BitArray};
+
+/// Protein text in which every character is stored as a 5-bit code inside a [`BitArray`].
+pub struct ProteinText {
+    /// The packed 5-bit codes, one per character of the text.
+    bit_array: BitArray,
+    /// Lookup table from an ASCII character to its 5-bit code.
+    char_to_5bit: HashMap<u8, u8>,
+    /// Lookup table from a 5-bit code back to its ASCII character.
+    bit5_to_char: Vec<u8>,
+}
+
+impl ProteinText {
+    /// The storable characters; the position of a character in this list is its 5-bit code.
+    /// `$` and the six uppercase letters missing from the original 21 symbols are appended, so
+    /// existing codes are unchanged and all 28 codes still fit in 5 bits.
+    const ALPHABET: [u8; 28] = *b"ACDEFGHIKLMNPQRSTVWY-$BJOUXZ";
+
+    /// Builds the character-to-code lookup table from the alphabet.
+    fn create_char_to_5bit_hashmap() -> HashMap<u8, u8> {
+        Self::ALPHABET.iter().enumerate().map(|(code, &character)| (character, code as u8)).collect()
+    }
+
+    /// Builds the code-to-character lookup table from the alphabet.
+    fn create_bit5_to_char() -> Vec<u8> {
+        Self::ALPHABET.to_vec()
+    }
+
+    /// Encodes `input_string` byte by byte; panics if a byte is not in the alphabet.
+    ///
+    /// Bytes (not `char`s) are encoded, so the stored length always equals `input_string.len()`
+    /// and a non-ASCII character is rejected instead of being truncated to an unrelated code.
+    pub fn from_string(input_string: &str) -> ProteinText {
+        Self::from_vec(input_string.as_bytes())
+    }
+
+    /// Encodes every byte of `input_vec`; panics if a byte is not in the alphabet.
+    pub fn from_vec(input_vec: &[u8]) -> ProteinText {
+        let mut text = Self::with_capacity(input_vec.len());
+        for (index, &character) in input_vec.iter().enumerate() {
+            text.set(index, character);
+        }
+        text
+    }
+
+    /// Wraps a bit array whose values are already 5-bit codes.
+    pub fn new(bit_array: BitArray) -> ProteinText {
+        let char_to_5bit = ProteinText::create_char_to_5bit_hashmap();
+        let bit5_to_char = ProteinText::create_bit5_to_char();
+        Self { bit_array, char_to_5bit, bit5_to_char }
+    }
+
+    /// Creates a text of `capacity` characters backed by a zero-filled 5-bit array.
+    pub fn with_capacity(capacity: usize) -> Self {
+        Self::new(BitArray::with_capacity(capacity, 5))
+    }
+
+    /// Returns the ASCII character stored at `index`.
+    ///
+    /// Panics if the stored code has no character assigned (codes 28 to 31).
+    pub fn get(&self, index: usize) -> u8 {
+        let char_5bit = self.bit_array.get(index) as usize;
+        self.bit5_to_char[char_5bit]
+    }
+
+    /// Stores the ASCII character `value` at `index`; panics if it is not in the alphabet.
+    pub fn set(&mut self, index: usize, value: u8) {
+        let char_5bit: u8 = *self.char_to_5bit.get(&value).expect("Input character not in alphabet");
+        self.bit_array.set(index, char_5bit as u64);
+    }
+
+    /// Returns the number of characters in the text.
+    pub fn len(&self) -> usize {
+        self.bit_array.len()
+    }
+
+    /// Returns `true` if the text holds no characters.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Clears the `BitArray`, setting all bits to 0.
+    pub fn clear(&mut self) {
+        self.bit_array.clear()
+    }
+
+    /// Returns an iterator over the decoded ASCII characters of the text.
+    pub fn iter(&self) -> ProteinTextIterator {
+        ProteinTextIterator { protein_text: self, index: 0 }
+    }
+}
+
+/// A view on the characters `start..end` of a [`ProteinText`].
+pub struct ProteinTextSlice<'a> {
+    /// The text this slice looks into.
+    text: &'a ProteinText,
+    /// Index in `text` of the first character of the slice (inclusive).
+    start: usize,
+    /// Index in `text` just past the last character of the slice (exclusive).
+    end: usize,
+}
+
+impl<'a> ProteinTextSlice<'a> {
+    /// Creates a view on the characters `start..end` of `text`.
+    pub fn new(text: &'a ProteinText, start: usize, end: usize) -> ProteinTextSlice {
+        Self { text, start, end }
+    }
+
+    /// Returns the character at `index`, counted from the start of the slice.
+    pub fn get(&self, index: usize) -> u8 {
+        self.text.get(self.start + index)
+    }
+
+    /// Returns the number of characters covered by the slice.
+    pub fn len(&self) -> usize {
+        self.end - self.start
+    }
+
+    /// Compares `other` with the slice character by character, stopping at the end of the
+    /// shorter of the two. With `equate_il`, `I` and `L` are treated as the same character.
+    #[inline]
+    pub fn equals_slice(&self, other: &[u8], equate_il: bool) -> bool {
+        other.iter().zip(self.iter()).all(|(&search_character, text_character)| {
+            search_character == text_character
+                || (equate_il && matches!((search_character, text_character), (b'I', b'L') | (b'L', b'I')))
+        })
+    }
+
+    /// Compares `search_string` with the slice at the positions listed in `il_locations`.
+    ///
+    /// Returns `true` if, for every entry of `il_locations`, `search_string` and the slice hold
+    /// the same character at position `il_location - skip`.
+    pub fn check_il_locations(&self, skip: usize, il_locations: &[usize], search_string: &[u8]) -> bool {
+        il_locations
+            .iter()
+            .map(|&il_location| il_location - skip)
+            .all(|index| search_string[index] == self.get(index))
+    }
+
+    /// Returns an iterator over the characters of the slice.
+    pub fn iter(&self) -> ProteinTextSliceIterator {
+        ProteinTextSliceIterator { text_slice: self, index: 0 }
+    }
+}
+
+/// Iterator over the characters of a [`ProteinText`], starting at index 0.
+pub struct ProteinTextIterator<'a> {
+    protein_text: &'a ProteinText,
+    index: usize,
+}
+
+/// Iterator over the characters of a [`ProteinTextSlice`], starting at the slice's first character.
+pub struct ProteinTextSliceIterator<'a> {
+    text_slice: &'a ProteinTextSlice<'a>,
+    index: usize,
+}
+
+impl<'a> Iterator for ProteinTextSliceIterator<'a> {
+    type Item = u8;
+
+    /// Yields the next character of the slice, or `None` once the end of the slice is reached.
+    fn next(&mut self) -> Option<Self::Item> {
+        let current = self.index;
+        (current < self.text_slice.len()).then(|| {
+            self.index = current + 1;
+            self.text_slice.get(current)
+        })
+    }
+}
+
+impl<'a> Iterator for ProteinTextIterator<'a> {
+    type Item = u8;
+
+    /// Yields the next character of the text, or `None` once the end of the text is reached.
+    fn next(&mut self) -> Option<Self::Item> {
+        let current = self.index;
+        (current < self.protein_text.len()).then(|| {
+            self.index = current + 1;
+            self.protein_text.get(current)
+        })
+    }
+}
+
+/// Writes a text to `writer` using 5 bits per value.
+///
+/// One byte holding the number of bits per value is written first, then the number of values as
+/// a little-endian `u64`, then the values themselves through `data_to_writer`.
+/// NOTE(review): `load_compressed_text` starts at the 8-byte size, so its caller presumably
+/// consumes the leading byte first; confirm.
+///
+/// # Arguments
+///
+/// * `text` - The values to write; each byte is handed to `data_to_writer` unchanged as an `i64`.
+/// * `writer` - The writer that receives the output.
+///
+/// # Errors
+///
+/// Returns an error if writing to the writer fails.
+pub fn dump_compressed_text(
+    text: Vec<u8>,
+    writer: &mut impl Write
+) -> Result<(), Box<dyn Error>> {
+    let bits_per_value = 5;
+    let header = [bits_per_value as u8];
+    let text_length = (text.len() as u64).to_le_bytes();
+
+    // Leading byte: the number of bits used for every value.
+    writer.write(&header).map_err(|_| "Could not write the required bits to the writer")?;
+    // Number of values that follow.
+    writer.write(&text_length).map_err(|_| "Could not write the size of the text to the writer")?;
+    // The values themselves.
+    let values: Vec<i64> = text.iter().copied().map(i64::from).collect();
+    data_to_writer(values, bits_per_value, 8 * 1024, writer)
+        .map_err(|_| "Could not write the compressed text to the writer")?;
+
+    Ok(())
+}
+
+/// Reads a text that was stored with 5 bits per value from `reader`.
+///
+/// The first 8 bytes are interpreted as the little-endian number of values; the values
+/// themselves are then read into a [`BitArray`] of that size.
+///
+/// # Arguments
+///
+/// * `reader` - The reader from which the size and the values are read.
+///
+/// # Errors
+///
+/// Returns an error if the size or the values cannot be read.
+pub fn load_compressed_text(
+    reader: &mut impl BufRead
+) -> Result<ProteinText, Box<dyn Error>> {
+    let bits_per_value: usize = 5;
+
+    // The text length is stored as 8 little-endian bytes.
+    let mut length_bytes = [0_u8; 8];
+    reader.read_exact(&mut length_bytes).map_err(|_| "Could not read the size of the text from the binary file")?;
+    let text_length = u64::from_le_bytes(length_bytes) as usize;
+
+    // Allocate room for all values, then let the bit array fill itself from the reader.
+    let mut bit_array = BitArray::with_capacity(text_length, bits_per_value);
+    bit_array.read_binary(reader).map_err(|_| "Could not read the compressed text from the binary file")?;
+
+    Ok(ProteinText::new(bit_array))
+}
+
+#[cfg(test)]
+mod tests {
+    use std::io::Read;
+
+    use super::*;
+    /// Writer stub whose `write` succeeds a limited number of times and then always fails.
+    pub struct FailingWriter {
+        /// The number of times the write function can be called before it fails.
+        pub valid_write_count: usize
+    }
+
+    impl Write for FailingWriter {
+        fn write(&mut self, _: &[u8]) -> Result<usize, std::io::Error> {
+            if self.valid_write_count == 0 {
+                return Err(std::io::Error::new(std::io::ErrorKind::Other, "Write failed"));
+            }
+
+            self.valid_write_count -= 1;
+            Ok(1) // reports a single byte as written, whatever the size of the buffer
+        }
+
+        fn flush(&mut self) -> Result<(), std::io::Error> {
+            Ok(())
+        }
+    }
+    /// Reader stub whose `read` succeeds a limited number of times and then always fails.
+    pub struct FailingReader {
+        /// The number of times the read function can be called before it fails.
+        pub valid_read_count: usize
+    }
+
+    impl Read for FailingReader {
+        fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+            if self.valid_read_count == 0 {
+                return Err(std::io::Error::new(std::io::ErrorKind::Other, "Read failed"));
+            }
+
+            self.valid_read_count -= 1;
+            Ok(buf.len()) // reports the buffer as completely filled without writing to it
+        }
+    }
+
+    impl BufRead for FailingReader {
+        fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
+            Ok(&[]) // never exposes any buffered bytes
+        }
+
+        fn consume(&mut self, _: usize) {}
+    }
+
+    #[test]
+    fn test_dump_compressed_text() {
+        let text: Vec<u8> = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
+
+        let mut writer = vec![];
+        dump_compressed_text(text, &mut writer).unwrap();
+
+        assert_eq!(writer, vec![
+            // header byte, text length (little-endian u64), then the packed text
+            5, // bits per value
+            10, 0, 0, 0, 0, 0, 0, 0, // size of the text
+            0, 128, 74, 232, 152, 66, 134, 8 // compressed text
+        ]);
+    }
+
+    #[test]
+    #[should_panic(expected = "Could not write the required bits to the writer")]
+    fn test_dump_compressed_text_fail_required_bits() {
+        let mut writer = FailingWriter { valid_write_count: 0 };
+        // No write succeeds, so writing the bits-per-value byte already fails.
+        dump_compressed_text(vec![], &mut writer).unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Could not write the size of the text to the writer")]
+    fn test_dump_compressed_text_fail_size() {
+        let mut writer = FailingWriter { valid_write_count: 1 };
+        // One write succeeds (the bits-per-value byte); the write of the size then fails.
+        dump_compressed_text(vec![], &mut writer).unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Could not write the compressed text to the writer")]
+    fn test_dump_compressed_text_fail_compressed_text() {
+        let mut writer = FailingWriter { valid_write_count: 3 };
+        // The two header writes succeed; the expected failure is in writing the packed text.
+        dump_compressed_text(vec![1], &mut writer).unwrap();
+    }
+
+    #[test]
+    fn test_load_compressed_text() {
+        let data = vec![
+            // no bits-per-value byte here: the input starts with the text length
+            10, 0, 0, 0, 0, 0, 0, 0, // size of the text
+            0, 128, 74, 232, 152, 66, 134, 8 // compressed text
+        ];
+
+        let mut reader = std::io::BufReader::new(&data[..]);
+        let compressed_text = load_compressed_text(&mut reader).unwrap();
+        // The packed text is expected to decode back to the values 1 through 10.
+        for i in 0..10 {
+            assert_eq!(compressed_text.get(i), i as u8 + 1);
+        }
+    }
+
+    #[test]
+    #[should_panic(expected = "Could not read the size of the text from the binary file")]
+    fn test_load_compressed_text_fail_size() {
+        let mut reader = FailingReader { valid_read_count: 0 };
+        // No read succeeds, so reading the size already fails.
+        load_compressed_text(&mut reader).unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Could not read the compressed text from the binary file")]
+    fn test_load_compressed_text_fail_compressed_text() {
+        let mut reader = FailingReader { valid_read_count: 2 };
+        // The size is read successfully; the expected failure is in reading the packed text.
+        load_compressed_text(&mut reader).unwrap();
+    }
+
+    #[test]
+    fn test_failing_writer() {
+        let mut writer = FailingWriter { valid_write_count: 0 };
+        assert!(writer.flush().is_ok());
+        assert!(writer.write(&[0]).is_err());
+    }
+
+    #[test]
+    fn test_failing_reader() {
+        let mut reader = FailingReader { valid_read_count: 0 };
+        let right_buffer: [u8; 0] = [];
+        assert_eq!(reader.fill_buf().unwrap(), &right_buffer);
+        assert_eq!(reader.consume(0), ());
+        let mut buffer = [0_u8; 1];
+        assert!(reader.read(&mut buffer).is_err());
+    }
+}