diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..a9084f7
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,23 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/ubuntu
+{
+ "name": "Unipept Index",
+ "image": "mcr.microsoft.com/devcontainers/base:ubuntu",
+
+ // Features to add to the dev container. More info: https://containers.dev/features.
+ "features": {
+ "ghcr.io/devcontainers/features/rust:1": {}
+ },
+
+ // Use 'forwardPorts' to make a list of ports inside the container available locally.
+ // "forwardPorts": [],
+
+ // Use 'postCreateCommand' to run commands after the container is created.
+ // "postCreateCommand": "",
+
+ // Configure tool-specific properties.
+ // "customizations": {},
+
+ // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
+ // "remoteUser": "root"
+}
diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml
new file mode 100644
index 0000000..2b12566
--- /dev/null
+++ b/.github/workflows/build_index.yml
@@ -0,0 +1,82 @@
+name: Build index binaries
+
+on:
+ schedule:
+ # Run on the first day of every month at midnight UTC
+ - cron: '0 0 1 * *'
+ push:
+ branches:
+ - feature/build_index_action
+ workflow_dispatch:
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+
+ steps:
+ # Check out the most recent version of the repository with submodules
+ - name: Check out repository
+ uses: actions/checkout@v3
+ with:
+ submodules: recursive
+
+ # Set up Rust toolchain
+ - name: Set up Rust
+ uses: dtolnay/rust-toolchain@stable
+
+ # Compile Rust code
+ - name: Compile Rust code
+ run: cargo build --release
+
+ # Create a directory "build"
+ - name: Create build directory
+ run: mkdir -p build/input
+
+ # Download the file "suffix-array.zip" from the most recent release of "unipept-database"
+ - name: Download suffix-array.zip
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ run: |
+ latest_release_url=$(curl -s https://api.github.com/repos/unipept/unipept-database/releases/latest | grep "browser_download_url.*suffix-array.zip" | cut -d '"' -f 4)
+ release_date=$(curl -s https://api.github.com/repos/unipept/unipept-database/releases/latest | grep '"published_at":' | cut -d '"' -f 4 | cut -d'T' -f1)
+ release_date_formatted=$(date -d "$release_date" "+%Y-%m-%d")
+ SP_VERSION="SP_$release_date_formatted"
+ echo "SP_VERSION=$SP_VERSION" >> $GITHUB_ENV
+ curl -L -o build/suffix-array.zip $latest_release_url
+
+ # Extract the contents of the output folder from the zip into a folder "build/input"
+ - name: Extract zip contents
+ run: |
+ unzip build/suffix-array.zip '*' -d build/input
+
+ # Make a directory with the SP_VERSION and process files
+ - name: Process files
+ run: |
+ mkdir -p build/$SP_VERSION
+ lz4 -d build/input/uniprot_entries.tsv.lz4 | cut -f2,4,7,8 > build/$SP_VERSION/proteins.tsv
+ lz4 -d build/input/taxons.tsv.lz4 > build/$SP_VERSION/taxons.tsv
+
+ # Run the sa-builder command
+ - name: Run sa-builder
+ run: |
+ prefix="build/$SP_VERSION"
+ ./target/release/sa-builder -d "$prefix/proteins.tsv" -o "$prefix/sa_sparse3_compressed.bin" -s 3 -a lib-div-suf-sort -c
+
+ # Zip the contents of the build/$SP_VERSION directory
+ - name: Zip build contents
+ run: |
+ cd "build/$SP_VERSION" && zip "index_$SP_VERSION.zip" "proteins.tsv" "taxons.tsv" "sa_sparse3_compressed.bin"
+
+ # Create a GitHub release and upload the zip file
+ - name: Upload or Update Release
+ id: upload_or_update_release
+ uses: softprops/action-gh-release@v1
+ with:
+ files: build/${{ env.SP_VERSION }}/index_${{ env.SP_VERSION }}.zip
+ tag_name: index-${{ env.SP_VERSION }}
+ name: Index ${{ env.SP_VERSION }}
+ target_commitish: ${{ github.sha }}
+ draft: false
+ prerelease: false
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..92b44ab
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
index c29abc3..9d81263 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1127,15 +1127,18 @@ dependencies = [
"serde",
"serde_json",
"tempdir",
+ "text-compression",
]
[[package]]
name = "sa-mappings"
version = "0.1.0"
dependencies = [
+ "bitarray",
"bytelines",
"fa-compression",
"tempdir",
+ "text-compression",
]
[[package]]
@@ -1275,6 +1278,13 @@ dependencies = [
"remove_dir_all",
]
+[[package]]
+name = "text-compression"
+version = "0.1.0"
+dependencies = [
+ "bitarray",
+]
+
[[package]]
name = "tinytemplate"
version = "1.2.1"
diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs
index e7265cd..a8084d1 100644
--- a/bitarray/src/binary.rs
+++ b/bitarray/src/binary.rs
@@ -159,10 +159,10 @@ mod tests {
#[test]
fn test_write_binary() {
let mut bitarray = BitArray::with_capacity(4, 40);
- bitarray.set(0, 0x1234567890);
- bitarray.set(1, 0xabcdef0123);
- bitarray.set(2, 0x4567890abc);
- bitarray.set(3, 0xdef0123456);
+ bitarray.set(0, 0x1234567890_u64);
+ bitarray.set(1, 0xabcdef0123_u64);
+ bitarray.set(2, 0x4567890abc_u64);
+ bitarray.set(3, 0xdef0123456_u64);
let mut buffer = Vec::new();
bitarray.write_binary(&mut buffer).unwrap();
diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs
index 655d17e..901b395 100644
--- a/bitarray/src/lib.rs
+++ b/bitarray/src/lib.rs
@@ -85,6 +85,7 @@ impl BitArray {
/// * `index` - The index of the value to set.
/// * `value` - The value to set at the specified index.
pub fn set(&mut self, index: usize, value: u64) {
+ // NOTE: `value` is already `u64`; no conversion or rebinding is needed here.
let start_block = index * self.bits_per_value / 64;
let start_block_offset = index * self.bits_per_value % 64;
@@ -142,11 +143,14 @@ impl BitArray {
pub fn clear(&mut self) {
self.data.iter_mut().for_each(|x| *x = 0);
}
+
+ pub fn get_data_slice(&self, start_slice: usize, end_slice: usize) -> &[u64] {
+ &self.data[start_slice..end_slice]
+ }
}
-/// Writes the data to a writer in a binary format using a bit array. This function is helpfull
-/// when writing large amounts of data to a writer in chunks. The data is written in chunks of the
-/// specified capacity, so memory usage is minimized.
+/// Writes the data to a writer in a binary format using a bit array. The data is written
+/// in chunks of the specified capacity, so memory usage is minimized.
///
/// # Arguments
///
@@ -257,10 +261,10 @@ mod tests {
fn test_bitarray_set() {
let mut bitarray = BitArray::with_capacity(4, 40);
- bitarray.set(0, 0b0001110011111010110001000111111100110010);
- bitarray.set(1, 0b1100001001010010011000010100110111001001);
- bitarray.set(2, 0b1111001101001101101101101011101001010001);
- bitarray.set(3, 0b0000100010010001010001001110101110011100);
+ bitarray.set(0, 0b0001110011111010110001000111111100110010_u64);
+ bitarray.set(1, 0b1100001001010010011000010100110111001001_u64);
+ bitarray.set(2, 0b1111001101001101101101101011101001010001_u64);
+ bitarray.set(3, 0b0000100010010001010001001110101110011100_u64);
assert_eq!(bitarray.data, vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000]);
}
diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs
index 98a1414..01cc3c4 100644
--- a/sa-builder/src/main.rs
+++ b/sa-builder/src/main.rs
@@ -21,7 +21,7 @@ fn main() {
eprintln!();
eprintln!("📋 Started loading the proteins...");
let start_proteins_time = get_time_ms().unwrap();
- let mut data = Proteins::try_from_database_file_without_annotations(&database_file)
+ let mut data = Proteins::try_from_database_file_uncompressed(&database_file)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!(
"✅ Successfully loaded the proteins in {} seconds!",
diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml
index de57fc9..25dda76 100644
--- a/sa-index/Cargo.toml
+++ b/sa-index/Cargo.toml
@@ -14,5 +14,6 @@ clap = { version = "4.4.8", features = ["derive"] }
rayon = "1.8.1"
serde = { version = "1.0.197", features = ["derive"] }
sa-mappings = { path = "../sa-mappings" }
+text-compression = { path = "../text-compression" }
bitarray = { path = "../bitarray" }
serde_json = "1.0.116"
diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs
index f276906..53f5348 100644
--- a/sa-index/src/lib.rs
+++ b/sa-index/src/lib.rs
@@ -115,11 +115,11 @@ mod tests {
#[test]
fn test_suffix_array_compressed() {
let mut bitarray = BitArray::with_capacity(5, 40);
- bitarray.set(0, 1);
- bitarray.set(1, 2);
- bitarray.set(2, 3);
- bitarray.set(3, 4);
- bitarray.set(4, 5);
+ bitarray.set(0, 1_u64);
+ bitarray.set(1, 2_u64);
+ bitarray.set(2, 3_u64);
+ bitarray.set(3, 4_u64);
+ bitarray.set(4, 5_u64);
let sa = SuffixArray::Compressed(bitarray, 1);
assert_eq!(sa.len(), 5);
diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs
index 0be7d17..2687de0 100644
--- a/sa-index/src/sa_searcher.rs
+++ b/sa-index/src/sa_searcher.rs
@@ -1,6 +1,7 @@
use std::{cmp::min, ops::Deref};
use sa_mappings::proteins::{Protein, Proteins, SEPARATION_CHARACTER, TERMINATION_CHARACTER};
+use text_compression::ProteinTextSlice;
use crate::{
sa_searcher::BoundSearch::{Maximum, Minimum},
@@ -75,7 +76,7 @@ pub struct SparseSearcher(Searcher);
impl SparseSearcher {
pub fn new(sa: SuffixArray, proteins: Proteins) -> Self {
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
Self(searcher)
}
@@ -93,7 +94,7 @@ pub struct DenseSearcher(Searcher);
impl DenseSearcher {
pub fn new(sa: SuffixArray, proteins: Proteins) -> Self {
- let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
Self(searcher)
}
@@ -176,12 +177,12 @@ impl Searcher {
// match as long as possible
while index_in_search_string < search_string.len()
- && index_in_suffix < self.proteins.input_string.len()
- && (search_string[index_in_search_string] == self.proteins.input_string[index_in_suffix]
+ && index_in_suffix < self.proteins.text.len()
+ && (search_string[index_in_search_string] == self.proteins.text.get(index_in_suffix)
|| (search_string[index_in_search_string] == b'L'
- && self.proteins.input_string[index_in_suffix] == b'I')
+ && self.proteins.text.get(index_in_suffix) == b'I')
|| (search_string[index_in_search_string] == b'I'
- && self.proteins.input_string[index_in_suffix] == b'L'))
+ && self.proteins.text.get(index_in_suffix) == b'L'))
{
index_in_suffix += 1;
index_in_search_string += 1;
@@ -191,7 +192,7 @@ impl Searcher {
if !search_string.is_empty() {
if index_in_search_string == search_string.len() {
is_cond_or_equal = true
- } else if index_in_suffix < self.proteins.input_string.len() {
+ } else if index_in_suffix < self.proteins.text.len() {
// in our index every L was replaced by a I, so we need to replace them if we want
// to search in the right direction
let peptide_char = if search_string[index_in_search_string] == b'L' {
@@ -200,10 +201,10 @@ impl Searcher {
search_string[index_in_search_string]
};
- let protein_char = if self.proteins.input_string[index_in_suffix] == b'L' {
+ let protein_char = if self.proteins.text.get(index_in_suffix) == b'L' {
b'I'
} else {
- self.proteins.input_string[index_in_suffix]
+ self.proteins.text.get(index_in_suffix)
};
is_cond_or_equal = condition_check(peptide_char, protein_char);
@@ -347,20 +348,20 @@ impl Searcher {
// check at all
if (skip == 0
|| Self::check_prefix(
- current_search_string_prefix,
- &self.proteins.input_string[match_start..suffix],
- equate_il
- ))
+ current_search_string_prefix,
+ ProteinTextSlice::new(&self.proteins.text, match_start, suffix),
+ equate_il
+ ))
&& Self::check_suffix(
- skip,
- il_locations_current_suffix,
- current_search_string_suffix,
- &self.proteins.input_string[suffix..match_end],
- equate_il
- )
+ skip,
+ il_locations_current_suffix,
+ current_search_string_suffix,
+ ProteinTextSlice::new(&self.proteins.text, suffix, match_end),
+ equate_il
+ )
&& (!tryptic
- || ((self.check_start_of_protein(match_start) || self.check_tryptic_cut(match_start))
- && (self.check_end_of_protein(match_end) || self.check_tryptic_cut(match_end))))
+ || ((self.check_start_of_protein(match_start) || self.check_tryptic_cut(match_start))
+ && (self.check_end_of_protein(match_end) || self.check_tryptic_cut(match_end))))
{
matching_suffixes.push((suffix - skip) as i64);
@@ -394,7 +395,7 @@ impl Searcher {
/// Returns true if the cut is at the start of a protein.
#[inline]
fn check_start_of_protein(&self, cut_index: usize) -> bool {
- cut_index == 0 || self.proteins.input_string[cut_index - 1] == SEPARATION_CHARACTER
+ cut_index == 0 || self.proteins.text.get(cut_index - 1) == SEPARATION_CHARACTER
}
/// Check if a cut is the end of a protein.
@@ -407,8 +408,8 @@ impl Searcher {
/// Returns true if the cut is at the end of a protein.
#[inline]
fn check_end_of_protein(&self, cut_index: usize) -> bool {
- self.proteins.input_string[cut_index] == TERMINATION_CHARACTER
- || self.proteins.input_string[cut_index] == SEPARATION_CHARACTER
+ self.proteins.text.get(cut_index) == TERMINATION_CHARACTER
+ || self.proteins.text.get(cut_index) == SEPARATION_CHARACTER
}
/// Check if a cut is a tryptic cut, so check if the amino acid preceding the cut is K or R and the amino acid at the cut is not P.
@@ -421,8 +422,8 @@ impl Searcher {
/// Returns true if the cut is a tryptic cut.
#[inline]
fn check_tryptic_cut(&self, cut_index: usize) -> bool {
- (self.proteins.input_string[cut_index - 1] == b'K' || self.proteins.input_string[cut_index - 1] == b'R')
- && self.proteins.input_string[cut_index] != b'P'
+ (self.proteins.text.get(cut_index - 1) == b'K' || self.proteins.text.get(cut_index - 1) == b'R')
+ && self.proteins.text.get(cut_index) != b'P'
}
/// Returns true of the prefixes are the same
@@ -438,16 +439,8 @@ impl Searcher {
/// Returns true if `search_string_prefix` and `index_prefix` are considered the same, otherwise
/// false
#[inline]
- fn check_prefix(search_string_prefix: &[u8], index_prefix: &[u8], equate_il: bool) -> bool {
- if equate_il {
- search_string_prefix.iter().zip(index_prefix).all(|(&search_character, &index_character)| {
- search_character == index_character
- || (search_character == b'I' && index_character == b'L')
- || (search_character == b'L' && index_character == b'I')
- })
- } else {
- search_string_prefix == index_prefix
- }
+ fn check_prefix(search_string_prefix: &[u8], index_prefix: ProteinTextSlice, equate_il: bool) -> bool {
+ index_prefix.equals_slice(search_string_prefix, equate_il)
}
/// Returns true of the search_string and index_string are equal
@@ -471,20 +464,10 @@ impl Searcher {
skip: usize,
il_locations: &[usize],
search_string: &[u8],
- index_string: &[u8],
+ text_slice: ProteinTextSlice,
equate_il: bool
) -> bool {
- if equate_il {
- true
- } else {
- for &il_location in il_locations {
- let index = il_location - skip;
- if search_string[index] != index_string[index] {
- return false;
- }
- }
- true
- }
+ if equate_il { true } else { text_slice.check_il_locations(skip, il_locations, search_string) }
}
/// Returns all the proteins that correspond with the provided suffixes
@@ -511,6 +494,7 @@ impl Searcher {
#[cfg(test)]
mod tests {
use sa_mappings::proteins::{Protein, Proteins};
+ use text_compression::ProteinText;
use crate::{
sa_searcher::{BoundSearchResult, SearchAllSuffixesResult, Searcher},
@@ -539,9 +523,11 @@ mod tests {
}
fn get_example_proteins() -> Proteins {
- let text = "AI-BLACVAA-AC-KCRLZ$".to_string().into_bytes();
+ let input_string = "AI-CLACVAA-AC-KCRLY$";
+ let text = ProteinText::from_string(input_string);
+
Proteins {
- input_string: text,
+ text,
proteins: vec![
Protein {
uniprot_id: String::new(),
@@ -572,7 +558,7 @@ mod tests {
let proteins = get_example_proteins();
let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
// search bounds 'A'
@@ -593,7 +579,7 @@ mod tests {
let proteins = get_example_proteins();
let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
// search suffix 'VAA'
@@ -610,14 +596,14 @@ mod tests {
let proteins = get_example_proteins();
let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
let bounds_res = searcher.search_bounds(&[b'I']);
assert_eq!(bounds_res, BoundSearchResult::SearchResult((13, 16)));
// search bounds 'RIZ' with equal I and L
- let bounds_res = searcher.search_bounds(&[b'R', b'I', b'Z']);
+ let bounds_res = searcher.search_bounds(&[b'R', b'I', b'Y']);
assert_eq!(bounds_res, BoundSearchResult::SearchResult((17, 18)));
}
@@ -626,25 +612,26 @@ mod tests {
let proteins = get_example_proteins();
let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein));
// search bounds 'RIZ' with equal I and L
- let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true, false);
+ let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Y'], usize::MAX, true, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![16]));
// search bounds 'RIZ' without equal I and L
- let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false, false);
+ let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Y'], usize::MAX, false, false);
assert_eq!(found_suffixes, SearchAllSuffixesResult::NoMatches);
}
// test edge case where an I or L is the first index in the sparse SA.
#[test]
fn test_l_first_index_in_sa() {
- let text = "LMOXZ$".to_string().into_bytes();
+ let input_string = "LMPYY$";
+ let text = ProteinText::from_string(input_string);
let proteins = Proteins {
- input_string: text,
+ text,
proteins: vec![Protein {
uniprot_id: String::new(),
taxon_id: 0,
@@ -653,7 +640,7 @@ mod tests {
};
let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
// search bounds 'IM' with equal I and L
@@ -663,10 +650,11 @@ mod tests {
#[test]
fn test_il_missing_matches() {
- let text = "AAILLL$".to_string().into_bytes();
+ let input_string = "AAILLL$";
+ let text = ProteinText::from_string(input_string);
let proteins = Proteins {
- input_string: text,
+ text,
proteins: vec![Protein {
uniprot_id: String::new(),
taxon_id: 0,
@@ -675,7 +663,7 @@ mod tests {
};
let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true, false);
@@ -684,10 +672,11 @@ mod tests {
#[test]
fn test_il_duplication() {
- let text = "IIIILL$".to_string().into_bytes();
+ let input_string = "IIIILL$";
+ let text = ProteinText::from_string(input_string);
let proteins = Proteins {
- input_string: text,
+ text,
proteins: vec![Protein {
uniprot_id: String::new(),
taxon_id: 0,
@@ -696,7 +685,7 @@ mod tests {
};
let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true, false);
@@ -705,10 +694,11 @@ mod tests {
#[test]
fn test_il_suffix_check() {
- let text = "IIIILL$".to_string().into_bytes();
+ let input_string = "IIIILL$";
+ let text = ProteinText::from_string(input_string);
let proteins = Proteins {
- input_string: text,
+ text,
proteins: vec![Protein {
uniprot_id: String::new(),
taxon_id: 0,
@@ -717,7 +707,7 @@ mod tests {
};
let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
// search all places where II is in the string IIIILL, but with a sparse SA
@@ -728,10 +718,11 @@ mod tests {
#[test]
fn test_il_duplication2() {
- let text = "IILLLL$".to_string().into_bytes();
+ let input_string = "IILLLL$";
+ let text = ProteinText::from_string(input_string);
let proteins = Proteins {
- input_string: text,
+ text,
proteins: vec![Protein {
uniprot_id: String::new(),
taxon_id: 0,
@@ -740,7 +731,7 @@ mod tests {
};
let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
// search bounds 'IM' with equal I and L
@@ -750,10 +741,11 @@ mod tests {
#[test]
fn test_tryptic_search() {
- let text = "PAA-AAKPKAPAA$".to_string().into_bytes();
+ let input_string = "PAA-AAKPKAPAA$";
+ let text = ProteinText::from_string(input_string);
let proteins = Proteins {
- input_string: text,
+ text,
proteins: vec![Protein {
uniprot_id: String::new(),
taxon_id: 0,
@@ -762,7 +754,7 @@ mod tests {
};
let sparse_sa = SuffixArray::Original(vec![13, 3, 12, 11, 1, 4, 2, 5, 9, 8, 6, 10, 0, 7], 1);
- let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string);
+ let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text);
let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein));
let found_suffixes_1 = searcher.search_matching_suffixes(&[b'P', b'A', b'A'], usize::MAX, false, true);
diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs
index 121b569..a6a4e93 100644
--- a/sa-index/src/suffix_to_protein_index.rs
+++ b/sa-index/src/suffix_to_protein_index.rs
@@ -1,5 +1,6 @@
use clap::ValueEnum;
use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER};
+use text_compression::ProteinText;
use crate::Nullable;
@@ -66,10 +67,10 @@ impl DenseSuffixToProtein {
/// # Returns
///
/// Returns a new DenseSuffixToProtein build over the provided text
- pub fn new(text: &[u8]) -> Self {
+ pub fn new(text: &ProteinText) -> Self {
let mut current_protein_index: u32 = 0;
 let mut suffix_index_to_protein: Vec<u32> = vec![];
- for &char in text.iter() {
+ for char in text.iter() {
if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER {
current_protein_index += 1;
suffix_index_to_protein.push(u32::NULL);
@@ -92,9 +93,9 @@ impl SparseSuffixToProtein {
/// # Returns
///
/// Returns a new SparseSuffixToProtein build over the provided text
- pub fn new(text: &[u8]) -> Self {
+ pub fn new(text: &ProteinText) -> Self {
 let mut suffix_index_to_protein: Vec<i64> = vec![0];
- for (index, &char) in text.iter().enumerate() {
+ for (index, char) in text.iter().enumerate() {
if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER {
suffix_index_to_protein.push(index as i64 + 1);
}
@@ -108,6 +109,7 @@ impl SparseSuffixToProtein {
mod tests {
use clap::ValueEnum;
use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER};
+ use text_compression::ProteinText;
use crate::{
suffix_to_protein_index::{
@@ -116,10 +118,10 @@ mod tests {
Nullable
};
- fn build_text() -> Vec<u8> {
+ fn build_text() -> ProteinText {
let mut text = ["ACG", "CG", "AAA"].join(&format!("{}", SEPARATION_CHARACTER as char));
text.push(TERMINATION_CHARACTER as char);
- text.into_bytes()
+ ProteinText::from_string(&text)
}
#[test]
diff --git a/sa-mappings/Cargo.toml b/sa-mappings/Cargo.toml
index b20a2bf..d255f7c 100644
--- a/sa-mappings/Cargo.toml
+++ b/sa-mappings/Cargo.toml
@@ -11,3 +11,5 @@ tempdir = "0.3.7"
[dependencies]
fa-compression = { path = "../fa-compression" }
bytelines = "2.5.0"
+bitarray = { path = "../bitarray" }
+text-compression = { path = "../text-compression" }
diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs
index f2b24cc..53e52b8 100644
--- a/sa-mappings/src/proteins.rs
+++ b/sa-mappings/src/proteins.rs
@@ -5,6 +5,7 @@ use std::{error::Error, fs::File, io::BufReader, ops::Index, str::from_utf8};
use bytelines::ByteLines;
use fa_compression::algorithm1::{decode, encode};
+use text_compression::ProteinText;
/// The separation character used in the input string
pub static SEPARATION_CHARACTER: u8 = b'-';
@@ -28,7 +29,7 @@ pub struct Protein {
/// A struct that represents a collection of proteins
pub struct Proteins {
/// The input string containing all proteins
- pub input_string: Vec<u8>,
+ pub text: ProteinText,
/// The proteins in the input string
 pub proteins: Vec<Protein>
@@ -46,7 +47,6 @@ impl Proteins {
///
/// # Arguments
/// * `file` - The path to the database file
- /// * `taxon_aggregator` - The `TaxonAggregator` to use
///
/// # Returns
///
@@ -86,16 +86,52 @@ impl Proteins {
input_string.pop();
input_string.push(TERMINATION_CHARACTER.into());
- input_string.shrink_to_fit();
proteins.shrink_to_fit();
- Ok(Self { input_string: input_string.into_bytes(), proteins })
+
+ let text = ProteinText::from_string(&input_string);
+ Ok(Self { text, proteins })
+ }
+
+ /// Creates a `ProteinText` which represents all the proteins concatenated from the database file
+ ///
+ /// # Arguments
+ /// * `file` - The path to the database file
+ ///
+ /// # Returns
+ ///
+ /// Returns a `Result` containing the `ProteinText`
+ ///
+ /// # Errors
+ ///
+ /// Returns a `Box<dyn Error>` if an error occurred while reading the database file
+ pub fn try_from_database_file_without_annotations(database_file: &str) -> Result<ProteinText, Box<dyn Error>> {
+ let mut input_string: String = String::new();
+
+ let file = File::open(database_file)?;
+
+ // Read the lines as bytes, since the input string is not guaranteed to be utf8
+ // because of the encoded functional annotations
+ let mut lines = ByteLines::new(BufReader::new(file));
+
+ while let Some(Ok(line)) = lines.next() {
+ let mut fields = line.split(|b| *b == b'\t');
+
+ // only get the taxon id and sequence from each line, we don't need the other parts
+ let sequence = from_utf8(fields.nth(2).unwrap())?;
+
+ input_string.push_str(&sequence.to_uppercase());
+ input_string.push(SEPARATION_CHARACTER.into());
+ }
+
+ let text = ProteinText::from_string(&input_string);
+
+ Ok(text)
}
/// Creates a `vec` which represents all the proteins concatenated from the database file
///
/// # Arguments
/// * `file` - The path to the database file
- /// * `taxon_aggregator` - The `TaxonAggregator` to use
///
/// # Returns
///
@@ -104,7 +140,7 @@ impl Proteins {
/// # Errors
///
 /// Returns a `Box<dyn Error>` if an error occurred while reading the database file
- pub fn try_from_database_file_without_annotations(database_file: &str) -> Result<Vec<u8>, Box<dyn Error>> {
+ pub fn try_from_database_file_uncompressed(database_file: &str) -> Result<Vec<u8>, Box<dyn Error>> {
let mut input_string: String = String::new();
let file = File::open(database_file)?;
@@ -181,8 +217,10 @@ mod tests {
#[test]
fn test_new_proteins() {
+ let input_string = "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG";
+ let text = ProteinText::from_string(&input_string);
let proteins = Proteins {
- input_string: "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes().to_vec(),
+ text,
proteins: vec![
Protein {
uniprot_id: "P12345".to_string(),
@@ -197,7 +235,6 @@ mod tests {
]
};
- assert_eq!(proteins.input_string, "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes());
assert_eq!(proteins.proteins.len(), 2);
assert_eq!(proteins[0].uniprot_id, "P12345");
assert_eq!(proteins[0].taxon_id, 1);
@@ -245,12 +282,7 @@ mod tests {
let proteins = Proteins::try_from_database_file_without_annotations(database_file.to_str().unwrap()).unwrap();
- let sep_char = SEPARATION_CHARACTER as char;
- let end_char = TERMINATION_CHARACTER as char;
- let expected = format!(
- "MLPGLALLLLAAWTARALEV{}PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG{}KWDSDPSGTKTCIDT{}KEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH{}",
- sep_char, sep_char, sep_char, end_char
- );
- assert_eq!(proteins, expected.as_bytes());
+ let expected = b'L';
+ assert_eq!(proteins.get(4), expected);
}
}
diff --git a/text-compression/Cargo.toml b/text-compression/Cargo.toml
new file mode 100644
index 0000000..c312a3c
--- /dev/null
+++ b/text-compression/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "text-compression"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+bitarray = { path = "../bitarray" }
diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs
new file mode 100644
index 0000000..4866a6c
--- /dev/null
+++ b/text-compression/src/lib.rs
@@ -0,0 +1,632 @@
+use std::{
+ collections::HashMap,
+ error::Error,
+ io::{BufRead, Write}
+};
+
+use bitarray::{data_to_writer, Binary, BitArray};
+
+/// Structure representing the proteins, stored in a bit array using 5 bits per amino acid.
+pub struct ProteinText {
+ /// Bit array holding the sequence of amino acids
+ bit_array: BitArray,
+ /// Hashmap storing the mapping between the character as `u8` and a 5 bit number.
+ char_to_5bit: HashMap<u8, u8>,
+ /// Vector storing the mapping between the 5 bit number and the character as `u8`.
+ bit5_to_char: Vec<u8>
+}
+
+impl ProteinText {
+ /// Creates the hashmap storing the mappings between the characters as `u8` and 5 bit numbers.
+ ///
+ /// # Returns
+ ///
+ /// Returns the hashmap
+ fn create_char_to_5bit_hashmap() -> HashMap<u8, u8> {
+ let mut hashmap = HashMap::<u8, u8>::new();
+ for (i, c) in "ACDEFGHIKLMNPQRSTVWY-$".chars().enumerate() {
+ hashmap.insert(c as u8, i as u8);
+ }
+
+ hashmap
+ }
+
+ /// Creates the vector storing the mappings between the 5 bit numbers and the characters as `u8`.
+ ///
+ /// # Returns
+ ///
+ /// Returns the vector
+ fn create_bit5_to_char() -> Vec<u8> {
+ let mut vec = Vec::<u8>::new();
+ for c in "ACDEFGHIKLMNPQRSTVWY-$".chars() {
+ vec.push(c as u8);
+ }
+ vec
+ }
+
+ /// Creates the compressed text from a string.
+ ///
+ /// # Arguments
+ /// * `input_string` - The text (proteins) in string format
+ ///
+ /// # Returns
+ ///
+ /// An instance of `ProteinText`
+ pub fn from_string(input_string: &str) -> ProteinText {
+ let char_to_5bit = ProteinText::create_char_to_5bit_hashmap();
+ let bit5_to_char = ProteinText::create_bit5_to_char();
+
+ let mut bit_array = BitArray::with_capacity(input_string.len(), 5);
+ for (i, c) in input_string.chars().enumerate() {
+ let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect("Input character not in alphabet");
+ bit_array.set(i, char_5bit as u64);
+ }
+
+ Self { bit_array, char_to_5bit, bit5_to_char }
+ }
+
+ /// Creates the compressed text from a vector.
+ ///
+ /// # Arguments
+ /// * `input_vec` - The text (proteins) in a vector with elements of type `u8` representing the amino acids.
+ ///
+ /// # Returns
+ ///
+ /// An instance of `ProteinText`
+ pub fn from_vec(input_vec: &[u8]) -> ProteinText {
+ let char_to_5bit = ProteinText::create_char_to_5bit_hashmap();
+ let bit5_to_char = ProteinText::create_bit5_to_char();
+
+ let mut bit_array = BitArray::with_capacity(input_vec.len(), 5);
+ for (i, e) in input_vec.iter().enumerate() {
+ let char_5bit: u8 = *char_to_5bit.get(e).expect("Input character not in alphabet");
+ bit_array.set(i, char_5bit as u64);
+ }
+
+ Self { bit_array, char_to_5bit, bit5_to_char }
+ }
+
+ /// Creates the compressed text from a bit array.
+ ///
+ /// # Arguments
+ /// * `bit_array` - The text (proteins) in a bit array using 5 bits for each amino acid.
+ ///
+ /// # Returns
+ ///
+ /// An instance of `ProteinText`
+ pub fn new(bit_array: BitArray) -> ProteinText {
+ let char_to_5bit = ProteinText::create_char_to_5bit_hashmap();
+ let bit5_to_char = ProteinText::create_bit5_to_char();
+ Self { bit_array, char_to_5bit, bit5_to_char }
+ }
+
+ /// Creates an instance of `ProteinText` with a given capacity.
+ ///
+ /// # Arguments
+ /// * `capacity` - The amount of characters in the text.
+ ///
+ /// # Returns
+ ///
+ /// An instance of `ProteinText`
+ pub fn with_capacity(capacity: usize) -> Self {
+ Self::new(BitArray::with_capacity(capacity, 5))
+ }
+
+ /// Search the character at a given position in the compressed text.
+ ///
+ /// # Arguments
+ /// * `index` - The index of the character to search.
+ ///
+ /// # Returns
+ ///
+ /// the character at position `index` as `u8`.
+ pub fn get(&self, index: usize) -> u8 {
+ let char_5bit = self.bit_array.get(index) as usize;
+ self.bit5_to_char[char_5bit]
+ }
+
+ /// Set the character at a given index.
+ ///
+ /// # Arguments
+ /// * `index` - The index of the character to change.
+ /// * `value` - The character to fill in as `u8`.
+ pub fn set(&mut self, index: usize, value: u8) {
+ let char_5bit: u8 = *self.char_to_5bit.get(&value).expect("Input character not in alphabet");
+ self.bit_array.set(index, char_5bit as u64);
+ }
+
+ /// Queries the length of the text.
+ ///
+ /// # Returns
+ ///
+ /// the length of the text
+ pub fn len(&self) -> usize {
+ self.bit_array.len()
+ }
+
+ /// Check if the text is empty (length 0).
+ ///
+ /// # Returns
+ ///
+ /// true if the text has length 0, false otherwise.
+ pub fn is_empty(&self) -> bool {
+ self.bit_array.len() == 0
+ }
+
+ /// Clears the `BitArray`, setting all bits to 0.
+ pub fn clear(&mut self) {
+ self.bit_array.clear()
+ }
+
+ /// Get an iterator over the characters of the text.
+ ///
+ /// # Returns
+ ///
+ /// A `ProteinTextIterator`, which can iterate over the characters of the text.
+ pub fn iter(&self) -> ProteinTextIterator {
+ ProteinTextIterator { protein_text: self, index: 0 }
+ }
+
+ /// Get a slice of the text
+ ///
+ /// # Returns
+ ///
+ /// A `ProteinTextSlice` representing a slice of the text.
+ pub fn slice(&self, start: usize, end: usize) -> ProteinTextSlice {
+ ProteinTextSlice::new(self, start, end)
+ }
+}
+
+/// Structure representing a slice of a `ProteinText`.
+pub struct ProteinTextSlice<'a> {
+ /// The `ProteinText` of which to take a slice.
+ text: &'a ProteinText,
+ /// The start of the slice.
+ start: usize, // included
+ /// The end of the slice.
+ end: usize // excluded
+}
+
+impl<'a> ProteinTextSlice<'a> {
+ /// Creates an instance of `ProteinTextSlice`, given the text and boundaries.
+ ///
+ /// # Arguments
+ /// * `text` - The `ProteinText` representing the text of proteins with 5 bits per amino acid.
+ /// * `start` - The start of the slice.
+ /// * `end` - The end of the slice.
+ ///
+ /// # Returns
+ ///
+ /// An instance of `ProteinTextSlice`
+ pub fn new(text: &'a ProteinText, start: usize, end: usize) -> ProteinTextSlice<'a> {
+ Self { text, start, end }
+ }
+
+ /// Get a character (amino acid) in the slice.
+ ///
+ /// # Arguments
+ /// * `index` - The index in the slice of the character to get.
+ ///
+ /// # Returns
+ ///
+ /// The character as `u8`.
+ pub fn get(&self, index: usize) -> u8 {
+ self.text.get(self.start + index)
+ }
+
+ /// Get the length of the slice.
+ ///
+ /// # Returns
+ ///
+ /// The length of the slice.
+ pub fn len(&self) -> usize {
+ self.end - self.start
+ }
+
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Checks if the slice and a given array of `u8` are equal.
+ /// I and L can be equated.
+ ///
+ /// # Arguments
+ /// * `other` - the array of `u8` to compare the slice with.
+ /// * `equate_il` - true if I and L need to be equated, false otherwise.
+ ///
+ /// # Returns
+ ///
+ /// True if the slice is equal to the given array, false otherwise.
+ #[inline]
+ pub fn equals_slice(&self, other: &[u8], equate_il: bool) -> bool {
+ if equate_il {
+ other.iter().zip(self.iter()).all(|(&search_character, text_character)| {
+ search_character == text_character
+ || (search_character == b'I' && text_character == b'L')
+ || (search_character == b'L' && text_character == b'I')
+ })
+ } else {
+ other
+ .iter()
+ .zip(self.iter())
+ .all(|(&search_character, text_character)| search_character == text_character)
+ }
+ }
+
+ /// Check if the slice and a given array of `u8` are equal on the I and L positions.
+ ///
+ /// # Arguments
+ /// * `skip` - The amount of positions this slice skipped, this has an influence on the I and L positions.
+ /// * `il_locations` - The positions where I and L occur.
+ /// * `search_string` - An array of `u8` to compare the slice with.
+ ///
+ /// # Returns
+ ///
+ /// True if the slice and `search_string` have the same contents on the I and L positions, false otherwise.
+ pub fn check_il_locations(&self, skip: usize, il_locations: &[usize], search_string: &[u8]) -> bool {
+ for &il_location in il_locations {
+ let index = il_location - skip;
+ if search_string[index] != self.get(index) {
+ return false;
+ }
+ }
+ true
+ }
+
+ /// Get an iterator over the slice.
+ ///
+ /// # Returns
+ ///
+ /// An iterator over the slice.
+ pub fn iter(&self) -> ProteinTextSliceIterator {
+ ProteinTextSliceIterator { text_slice: self, index: 0 }
+ }
+}
+
+/// Structure representing an iterator over a `ProteinText` instance, iterating the characters of the text.
+pub struct ProteinTextIterator<'a> {
+ protein_text: &'a ProteinText,
+ index: usize
+}
+
+ /// Structure representing an iterator over a `ProteinTextSlice` instance, iterating the characters of the slice.
+pub struct ProteinTextSliceIterator<'a> {
+ text_slice: &'a ProteinTextSlice<'a>,
+ index: usize
+}
+
+impl<'a> Iterator for ProteinTextSliceIterator<'a> {
+ type Item = u8;
+
+ /// Get the next character in the `ProteinTextSlice`.
+ ///
+ /// # Returns
+ ///
+ /// The next character in the slice.
+ fn next(&mut self) -> Option<u8> {
+ if self.index >= self.text_slice.len() {
+ return None;
+ }
+
+ self.index += 1;
+ Some(self.text_slice.get(self.index - 1))
+ }
+}
+
+impl<'a> Iterator for ProteinTextIterator<'a> {
+ type Item = u8;
+
+ /// Get the next character in the `ProteinText`.
+ ///
+ /// # Returns
+ ///
+ /// The next character in the text.
+ fn next(&mut self) -> Option<u8> {
+ if self.index >= self.protein_text.len() {
+ return None;
+ }
+
+ self.index += 1;
+ Some(self.protein_text.get(self.index - 1))
+ }
+}
+
+/// Writes the compressed text to a writer.
+///
+/// # Arguments
+///
+/// * `text` - The text to be compressed.
+/// * `writer` - The writer to which the compressed text will be written.
+///
+/// # Errors
+///
+/// Returns an error if writing to the writer fails.
+ pub fn dump_compressed_text(text: Vec<u8>, writer: &mut impl Write) -> Result<(), Box<dyn Error>> {
+ let bits_per_value = 5;
+
+ // Write the flags to the writer
+ // 00000001 indicates that the text is compressed
+ writer
+ .write(&[bits_per_value as u8])
+ .map_err(|_| "Could not write the required bits to the writer")?;
+
+ // Write the size of the text to the writer
+ writer
+ .write(&(text.len() as u64).to_le_bytes())
+ .map_err(|_| "Could not write the size of the text to the writer")?;
+
+ // Compress the text and write it to the writer
+ let text_writer: Vec<u64> = text.iter().map(|item| <u64>::from(*item)).collect();
+ data_to_writer(text_writer, bits_per_value, 8 * 1024, writer)
+ .map_err(|_| "Could not write the compressed text to the writer")?;
+
+ Ok(())
+}
+
+/// Load the compressed text from a reader.
+///
+/// # Arguments
+///
+/// * `reader` - The reader from which the compressed text will be read.
+///
+/// # Errors
+///
+/// Returns an error if reading from the reader fails.
+ pub fn load_compressed_text(reader: &mut impl BufRead) -> Result<ProteinText, Box<dyn Error>> {
+ let bits_per_value: usize = 5;
+ // Read the size of the text from the binary file (8 bytes)
+ let mut size_buffer = [0_u8; 8];
+ reader
+ .read_exact(&mut size_buffer)
+ .map_err(|_| "Could not read the size of the text from the binary file")?;
+ let size = u64::from_le_bytes(size_buffer) as usize;
+
+ // Read the compressed text from the binary file
+ let mut compressed_text = BitArray::with_capacity(size, bits_per_value);
+ compressed_text
+ .read_binary(reader)
+ .map_err(|_| "Could not read the compressed text from the binary file")?;
+
+ Ok(ProteinText::new(compressed_text))
+}
+
+#[cfg(test)]
+mod tests {
+ use std::io::Read;
+
+ use super::*;
+
+ pub struct FailingWriter {
+ /// The number of times the write function can be called before it fails.
+ pub valid_write_count: usize
+ }
+
+ impl Write for FailingWriter {
+ fn write(&mut self, _: &[u8]) -> Result<usize, std::io::Error> {
+ if self.valid_write_count == 0 {
+ return Err(std::io::Error::new(std::io::ErrorKind::Other, "Write failed"));
+ }
+
+ self.valid_write_count -= 1;
+ Ok(1)
+ }
+
+ fn flush(&mut self) -> Result<(), std::io::Error> {
+ Ok(())
+ }
+ }
+
+ pub struct FailingReader {
+ /// The number of times the read function can be called before it fails.
+ pub valid_read_count: usize
+ }
+
+ impl Read for FailingReader {
+ fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+ if self.valid_read_count == 0 {
+ return Err(std::io::Error::new(std::io::ErrorKind::Other, "Read failed"));
+ }
+
+ self.valid_read_count -= 1;
+ Ok(buf.len())
+ }
+ }
+
+ impl BufRead for FailingReader {
+ fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
+ Ok(&[])
+ }
+
+ fn consume(&mut self, _: usize) {}
+ }
+
+ #[test]
+ fn test_u8_5bit_conversion() {
+ let char_to_5bit = ProteinText::create_char_to_5bit_hashmap();
+ let bit5_to_char = ProteinText::create_bit5_to_char();
+
+ for c in "ACDEFGHIKLMNPQRSTVWY-$".chars() {
+ let char_5bit = char_to_5bit.get(&(c as u8)).unwrap();
+ assert_eq!(c as u8, bit5_to_char[*char_5bit as usize]);
+ }
+ }
+
+ #[test]
+ fn test_build_from_string() {
+ let text = ProteinText::from_string("ACACA-CAC$");
+
+ for (i, c) in "ACACA-CAC$".chars().enumerate() {
+ assert_eq!(c as u8, text.get(i));
+ }
+ }
+
+ #[test]
+ fn test_build_from_vec() {
+ let vec = vec![b'A', b'C', b'A', b'C', b'A', b'-', b'C', b'A', b'C', b'$'];
+ let text = ProteinText::from_vec(&vec);
+
+ for (i, c) in "ACACA-CAC$".chars().enumerate() {
+ assert_eq!(c as u8, text.get(i));
+ }
+ }
+
+ #[test]
+ fn test_build_from_bitarray() {
+ let input_string = "ACACA-CAC$";
+ let char_to_5bit = ProteinText::create_char_to_5bit_hashmap();
+
+ let mut bit_array = BitArray::with_capacity(input_string.len(), 5);
+ for (i, c) in input_string.chars().enumerate() {
+ let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect("Input character not in alphabet");
+ bit_array.set(i, char_5bit as u64);
+ }
+
+ let text = ProteinText::new(bit_array);
+
+ for (i, c) in "ACACA-CAC$".chars().enumerate() {
+ assert_eq!(c as u8, text.get(i));
+ }
+ }
+
+ #[test]
+ fn test_build_with_capacity() {
+ let input_string = "ACACA-CAC$";
+
+ let mut text = ProteinText::with_capacity(input_string.len());
+ for (i, c) in "ACACA-CAC$".chars().enumerate() {
+ text.set(i, c as u8);
+ }
+
+ for (i, c) in "ACACA-CAC$".chars().enumerate() {
+ assert_eq!(c as u8, text.get(i));
+ }
+ }
+
+ #[test]
+ fn test_text_slice() {
+ let input_string = "ACACA-CAC$";
+ let start = 1;
+ let end = 5;
+ let text = ProteinText::from_string(&input_string);
+ let text_slice = text.slice(start, end);
+
+ for (i, c) in input_string[start..end].chars().enumerate() {
+ assert_eq!(c as u8, text_slice.get(i));
+ }
+ }
+
+ #[test]
+ fn test_equals_slice() {
+ let input_string = "ACICA-CAC$";
+ let text = ProteinText::from_string(&input_string);
+ let text_slice = text.slice(1, 5);
+ let eq_slice_true = [b'C', b'I', b'C', b'A'];
+ let eq_slice_false = [b'C', b'C', b'C', b'A'];
+ let eq_slice_il_true = [b'C', b'L', b'C', b'A'];
+
+ assert!(text_slice.equals_slice(&eq_slice_true, false));
+ assert!(!text_slice.equals_slice(&eq_slice_false, false));
+ assert!(text_slice.equals_slice(&eq_slice_il_true, true));
+ }
+
+ #[test]
+ fn test_check_il_locations() {
+ let input_string = "ACILA-CAC$";
+ let text = ProteinText::from_string(&input_string);
+ let text_slice = text.slice(1, 5);
+ let il_locations = [1, 2];
+ let il_true = [b'C', b'I', b'L', b'A'];
+ let il_false = [b'C', b'I', b'C', b'A'];
+
+ assert!(text_slice.check_il_locations(0, &il_locations, &il_true));
+ assert!(!text_slice.check_il_locations(0, &il_locations, &il_false));
+ }
+
+ #[test]
+ fn test_dump_compressed_text() {
+ let text: Vec<u8> = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
+
+ let mut writer = vec![];
+ dump_compressed_text(text, &mut writer).unwrap();
+
+ assert_eq!(writer, vec![
+ // bits per value
+ 5, // size of the text
+ 10, 0, 0, 0, 0, 0, 0, 0, // compressed text
+ 0, 128, 74, 232, 152, 66, 134, 8
+ ]);
+ }
+
+ #[test]
+ #[should_panic(expected = "Could not write the required bits to the writer")]
+ fn test_dump_compressed_text_fail_required_bits() {
+ let mut writer = FailingWriter { valid_write_count: 0 };
+
+ dump_compressed_text(vec![], &mut writer).unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "Could not write the size of the text to the writer")]
+ fn test_dump_compressed_text_fail_size() {
+ let mut writer = FailingWriter { valid_write_count: 1 };
+
+ dump_compressed_text(vec![], &mut writer).unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "Could not write the compressed text to the writer")]
+ fn test_dump_compressed_text_fail_compressed_text() {
+ let mut writer = FailingWriter { valid_write_count: 3 };
+
+ dump_compressed_text(vec![1], &mut writer).unwrap();
+ }
+
+ #[test]
+ fn test_load_compressed_text() {
+ let data = vec![
+ // size of the text
+ 10, 0, 0, 0, 0, 0, 0, 0, // compressed text
+ 0, 128, 74, 232, 152, 66, 134, 8,
+ ];
+
+ let mut reader = std::io::BufReader::new(&data[..]);
+ let compressed_text = load_compressed_text(&mut reader).unwrap();
+
+ for (i, c) in "CDEFGHIKLM".chars().enumerate() {
+ assert_eq!(compressed_text.get(i), c as u8);
+ }
+ }
+
+ #[test]
+ #[should_panic(expected = "Could not read the size of the text from the binary file")]
+ fn test_load_compressed_text_fail_size() {
+ let mut reader = FailingReader { valid_read_count: 0 };
+
+ load_compressed_text(&mut reader).unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "Could not read the compressed text from the binary file")]
+ fn test_load_compressed_text_fail_compressed_text() {
+ let mut reader = FailingReader { valid_read_count: 2 };
+
+ load_compressed_text(&mut reader).unwrap();
+ }
+
+ #[test]
+ fn test_failing_writer() {
+ let mut writer = FailingWriter { valid_write_count: 0 };
+ assert!(writer.flush().is_ok());
+ assert!(writer.write(&[0]).is_err());
+ }
+
+ #[test]
+ fn test_failing_reader() {
+ let mut reader = FailingReader { valid_read_count: 0 };
+ let right_buffer: [u8; 0] = [];
+ assert_eq!(reader.fill_buf().unwrap(), &right_buffer);
+ assert_eq!(reader.consume(0), ());
+ let mut buffer = [0_u8; 1];
+ assert!(reader.read(&mut buffer).is_err());
+ }
+}
diff --git a/unipept-index.iml b/unipept-index.iml
new file mode 100644
index 0000000..ce5666f
--- /dev/null
+++ b/unipept-index.iml
@@ -0,0 +1,24 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file