From 3326ac8398f681ce6ee8cee4cf64614512a3f58a Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Wed, 7 Aug 2024 14:02:10 +0200 Subject: [PATCH 01/27] Implement GitHub action that automatically builds sa binaries --- .devcontainer/devcontainer.json | 23 +++++++++ .github/workflows/build_index.yml | 82 +++++++++++++++++++++++++++++++ .idea/.gitignore | 8 +++ .idea/misc.xml | 6 +++ .idea/modules.xml | 8 +++ .idea/vcs.xml | 6 +++ unipept-index.iml | 9 ++++ 7 files changed, 142 insertions(+) create mode 100644 .devcontainer/devcontainer.json create mode 100644 .github/workflows/build_index.yml create mode 100644 .idea/.gitignore create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 unipept-index.iml diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..a9084f7 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,23 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/anaconda +{ + "name": "Unipept Index", + "image": "mcr.microsoft.com/devcontainers/base:ubuntu", + + // Features to add to the dev container. More info: https://containers.dev/features. + "features": { + "ghcr.io/devcontainers/features/rust:1": {} + }, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + // "postCreateCommand": "", + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" +} diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml new file mode 100644 index 0000000..236921f --- /dev/null +++ b/.github/workflows/build_index.yml @@ -0,0 +1,82 @@ +name: Build index binaries + +on: + schedule: + # Run on the first day of every month at midnight UTC + - cron: '0 0 1 * *' + push: + branches: + - feature/build_index_action + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + + steps: + # Check out the most recent version of the repository with submodules + - name: Check out repository + uses: actions/checkout@v3 + with: + submodules: recursive + + # Set up Rust toolchain + - name: Set up Rust + uses: dtolnay/rust-toolchain@stable + + # Compile Rust code + - name: Compile Rust code + run: cargo build --release + + # Create a directory "build" + - name: Create build directory + run: mkdir -p build/input + + # Download the file "suffix-array.zip" from the most recent release of "unipept-database" + - name: Download suffix-array.zip + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + latest_release_url=$(curl -s https://api.github.com/repos/unipept/unipept-database/releases/latest | grep "browser_download_url.*suffix-array.zip" | cut -d '"' -f 4) + release_date=$(curl -s https://api.github.com/repos/unipept/unipept-database/releases/latest | grep '"published_at":' | cut -d '"' -f 4 | cut -d'T' -f1) + release_date_formatted=$(date -d $release_date "+%Y-%m-%d") + SP_VERSION="SP_$release_date_formatted" + echo "SP_VERSION=$SP_VERSION" >> $GITHUB_ENV + curl -L -o build/suffix-array.zip $latest_release_url + + # Extract the contents of this zip into a folder "build/input" + - name: Extract zip contents + run: unzip build/suffix-array.zip -d build/input + + # Make a directory with the SP_VERSION and process files + - name: Process files + run: | + mkdir -p build/$SP_VERSION + lz4 -d build/input/uniprot_entries.tsv.lz4 | cut -f2,4,7,8 > build/$SP_VERSION/proteins.tsv + lz4 -d build/input/taxons.tsv.lz4 > build/$SP_VERSION/taxons.tsv + + # Step 8: Run the sa-builder command + - name: Run sa-builder + run: | + prefix="build/$SP_VERSION" + ./target/release/sa-builder -d "$prefix/proteins.tsv" -t "$prefix/taxons.tsv" -o "$prefix/sa_sparse3_compressed.bin" -s 3 -a lib-div-suf-sort -c + + # Zip the contents of the build/$SP_VERSION directory + - name: Zip build contents + run: | + prefix="build/$SP_VERSION" + zip -r "build/$SP_VERSION.zip" "$prefix" + + # Create a GitHub release and upload the zip file + - name: Upload or Update Release + id: upload_or_update_release + uses: softprops/action-gh-release@v1 + with: + files: build/$SP_VERSION.zip + tag_name: index-${{ env.SP_VERSION }} + name: Index ${{ env.SP_VERSION }} + commitish: ${{ github.sha }} + draft: false + prerelease: false + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..639900d --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..92b44ab --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/unipept-index.iml b/unipept-index.iml new file mode 100644 index 0000000..8021953 --- /dev/null +++ b/unipept-index.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file From 1d51202d5113ebeca6336f65187c99351c7516c7 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Wed, 7 Aug 2024 14:06:25 +0200 Subject: [PATCH 02/27] Updated the unzip command to take into account output subdir --- .github/workflows/build_index.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml index 236921f..9f81d7c 100644 --- a/.github/workflows/build_index.yml +++ b/.github/workflows/build_index.yml @@ -44,9 +44,12 @@ jobs: echo "SP_VERSION=$SP_VERSION" >> $GITHUB_ENV curl -L -o build/suffix-array.zip $latest_release_url - # Extract the contents of this zip into a folder "build/input" + # Extract the contents of the output folder from the zip into a folder "build/input" - name: Extract zip contents - run: unzip build/suffix-array.zip -d build/input + run: | + unzip build/suffix-array.zip 'output/*' -d build/temp + mv build/temp/output/* build/input/ + rm -r build/temp # Make a directory with the SP_VERSION and process files - name: Process files From fe6da8fbc9b083c4ccfbc9e3283671b3f6921b9a Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Wed, 7 Aug 2024 14:10:28 +0200 Subject: [PATCH 03/27] taxons file no longer required for building index --- .github/workflows/build_index.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml index 9f81d7c..b199cd8 100644 --- a/.github/workflows/build_index.yml +++ b/.github/workflows/build_index.yml @@ -62,7 +62,7 @@ jobs: - name: Run sa-builder run: | prefix="build/$SP_VERSION" - ./target/release/sa-builder -d "$prefix/proteins.tsv" -t "$prefix/taxons.tsv" -o "$prefix/sa_sparse3_compressed.bin" -s 3 -a lib-div-suf-sort -c + ./target/release/sa-builder -d "$prefix/proteins.tsv" -o "$prefix/sa_sparse3_compressed.bin" -s 3 -a lib-div-suf-sort -c # Zip the contents of the build/$SP_VERSION directory - name: Zip build contents From b8e56b010aa692fc6d9e053e3f98fa6c39b59284 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Wed, 7 Aug 2024 14:15:04 +0200 Subject: [PATCH 04/27] Use correct environment variable when uploading file --- .github/workflows/build_index.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml index b199cd8..2cc4f2c 100644 --- a/.github/workflows/build_index.yml +++ b/.github/workflows/build_index.yml @@ -75,7 +75,7 @@ jobs: id: upload_or_update_release uses: softprops/action-gh-release@v1 with: - files: build/$SP_VERSION.zip + files: build/${{ env.SP_VERSION }}.zip tag_name: index-${{ env.SP_VERSION }} name: Index ${{ env.SP_VERSION }} commitish: ${{ github.sha }} From 83ff845990f5c6f76038af54e6421caf12994354 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Wed, 7 Aug 2024 14:21:11 +0200 Subject: [PATCH 05/27] Update zip name --- .github/workflows/build_index.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml index 2cc4f2c..ab50afa 100644 --- a/.github/workflows/build_index.yml +++ b/.github/workflows/build_index.yml @@ -67,15 +67,14 @@ jobs: # Zip the contents of the build/$SP_VERSION directory - name: Zip build contents run: | - prefix="build/$SP_VERSION" - zip -r "build/$SP_VERSION.zip" "$prefix" + cd build && zip -r "index_$SP_VERSION.zip" "$SP_VERSION" # Create a GitHub release and upload the zip file - name: Upload or Update Release id: upload_or_update_release uses: softprops/action-gh-release@v1 with: - files: build/${{ env.SP_VERSION }}.zip + files: build/index_${{ env.SP_VERSION }}.zip tag_name: index-${{ env.SP_VERSION }} name: Index ${{ env.SP_VERSION }} commitish: ${{ github.sha }} From bdc1d5ca45157cc25165d9a648b88974a9d712a8 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Fri, 9 Aug 2024 13:45:45 +0200 Subject: [PATCH 06/27] Remove redundant subdir from zipped folder --- .github/workflows/build_index.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml index ab50afa..dd093b3 100644 --- a/.github/workflows/build_index.yml +++ b/.github/workflows/build_index.yml @@ -67,7 +67,7 @@ jobs: # Zip the contents of the build/$SP_VERSION directory - name: Zip build contents run: | - cd build && zip -r "index_$SP_VERSION.zip" "$SP_VERSION" + cd build && zip -r "index_$SP_VERSION.zip" "$SP_VERSION/*.tsv.lz4" # Create a GitHub release and upload the zip file - name: Upload or Update Release From 3e812a5945b885eb7166b518dbc27a1fd0e1703b Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Fri, 9 Aug 2024 13:47:48 +0200 Subject: [PATCH 07/27] Files are tsv instead of tsv.lz4 --- .github/workflows/build_index.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml index dd093b3..25c0b76 100644 --- a/.github/workflows/build_index.yml +++ b/.github/workflows/build_index.yml @@ -67,7 +67,7 @@ jobs: # Zip the contents of the build/$SP_VERSION directory - name: Zip build contents run: | - cd build && zip -r "index_$SP_VERSION.zip" "$SP_VERSION/*.tsv.lz4" + cd build && zip -r "index_$SP_VERSION.zip" "$SP_VERSION/*.tsv" "$SP_VERSION/*.bin" # Create a GitHub release and upload the zip file - name: Upload or Update Release From fcafc7661905963efafae69c30ea0baef390d014 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Fri, 9 Aug 2024 13:51:04 +0200 Subject: [PATCH 08/27] Remove recursive argument from zip --- .github/workflows/build_index.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml index 25c0b76..016e3b5 100644 --- a/.github/workflows/build_index.yml +++ b/.github/workflows/build_index.yml @@ -67,7 +67,7 @@ jobs: # Zip the contents of the build/$SP_VERSION directory - name: Zip build contents run: | - cd build && zip -r "index_$SP_VERSION.zip" "$SP_VERSION/*.tsv" "$SP_VERSION/*.bin" + cd build && zip "index_$SP_VERSION.zip" "$SP_VERSION/*.tsv" "$SP_VERSION/*.bin" # Create a GitHub release and upload the zip file - name: Upload or Update Release From 04cc0ac4cca4234a7d683f25e5c54c8f94a1ae26 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Fri, 9 Aug 2024 13:55:40 +0200 Subject: [PATCH 09/27] Update build_index.yml --- .github/workflows/build_index.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml index 016e3b5..7135bb7 100644 --- a/.github/workflows/build_index.yml +++ b/.github/workflows/build_index.yml @@ -67,7 +67,7 @@ jobs: # Zip the contents of the build/$SP_VERSION directory - name: Zip build contents run: | - cd build && zip "index_$SP_VERSION.zip" "$SP_VERSION/*.tsv" "$SP_VERSION/*.bin" + cd build && zip "index_$SP_VERSION.zip" "$SP_VERSION/proteins.tsv" "$SP_VERSION/taxons.tsv" "$SP_VERSION/sa_sparse3_compressed.bin" # Create a GitHub release and upload the zip file - name: Upload or Update Release From 64ef7fe1a4ab9d0b97232ff7226fdfb128bec51a Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Fri, 9 Aug 2024 14:11:32 +0200 Subject: [PATCH 10/27] Finally fix redundant folder in zip --- .github/workflows/build_index.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml index 7135bb7..69503d7 100644 --- a/.github/workflows/build_index.yml +++ b/.github/workflows/build_index.yml @@ -67,14 +67,14 @@ jobs: # Zip the contents of the build/$SP_VERSION directory - name: Zip build contents run: | - cd build && zip "index_$SP_VERSION.zip" "$SP_VERSION/proteins.tsv" "$SP_VERSION/taxons.tsv" "$SP_VERSION/sa_sparse3_compressed.bin" + cd "build/$SP_VERSION" && zip "index_$SP_VERSION.zip" "proteins.tsv" "taxons.tsv" "sa_sparse3_compressed.bin" # Create a GitHub release and upload the zip file - name: Upload or Update Release id: upload_or_update_release uses: softprops/action-gh-release@v1 with: - files: build/index_${{ env.SP_VERSION }}.zip + files: build/${{ env.SP_VERSION }}/index_${{ env.SP_VERSION }}.zip tag_name: index-${{ env.SP_VERSION }} name: Index ${{ env.SP_VERSION }} commitish: ${{ github.sha }} From a774d7e777be595359e5b6f29aba62accfcf7da1 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Mon, 12 Aug 2024 10:53:11 +0200 Subject: [PATCH 11/27] Also add the raw uniprot_entries file to the release zip --- .github/workflows/build_index.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml index 69503d7..6573f60 100644 --- a/.github/workflows/build_index.yml +++ b/.github/workflows/build_index.yml @@ -57,6 +57,7 @@ jobs: mkdir -p build/$SP_VERSION lz4 -d build/input/uniprot_entries.tsv.lz4 | cut -f2,4,7,8 > build/$SP_VERSION/proteins.tsv lz4 -d build/input/taxons.tsv.lz4 > build/$SP_VERSION/taxons.tsv + mv build/input/uniprot_entries.tsv.lz4 build/$SP_VERSION/uniprot_entries.tsv.lz4 # Step 8: Run the sa-builder command - name: Run sa-builder From 29c53571c127522f15c1e8c12680c0313d0fd6e7 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Mon, 12 Aug 2024 10:59:56 +0200 Subject: [PATCH 12/27] Forgot to also update the ZIP command --- .github/workflows/build_index.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml index 6573f60..f0a54fd 100644 --- a/.github/workflows/build_index.yml +++ b/.github/workflows/build_index.yml @@ -68,7 +68,7 @@ jobs: # Zip the contents of the build/$SP_VERSION directory - name: Zip build contents run: | - cd "build/$SP_VERSION" && zip "index_$SP_VERSION.zip" "proteins.tsv" "taxons.tsv" "sa_sparse3_compressed.bin" + cd "build/$SP_VERSION" && zip "index_$SP_VERSION.zip" "proteins.tsv" "taxons.tsv" "sa_sparse3_compressed.bin" "uniprot_entries.tsv.lz4" # Create a GitHub release and upload the zip file - name: Upload or Update Release From e0621895333a1b5509f6a2e3bcbf65d6c1fbb5c8 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Mon, 12 Aug 2024 13:10:46 +0200 Subject: [PATCH 13/27] Remove uniprot_entries.tsv.lz4 again... --- .github/workflows/build_index.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml index f0a54fd..69503d7 100644 --- a/.github/workflows/build_index.yml +++ b/.github/workflows/build_index.yml @@ -57,7 +57,6 @@ jobs: mkdir -p build/$SP_VERSION lz4 -d build/input/uniprot_entries.tsv.lz4 | cut -f2,4,7,8 > build/$SP_VERSION/proteins.tsv lz4 -d build/input/taxons.tsv.lz4 > build/$SP_VERSION/taxons.tsv - mv build/input/uniprot_entries.tsv.lz4 build/$SP_VERSION/uniprot_entries.tsv.lz4 # Step 8: Run the sa-builder command - name: Run sa-builder @@ -68,7 +67,7 @@ jobs: # Zip the contents of the build/$SP_VERSION directory - name: Zip build contents run: | - cd "build/$SP_VERSION" && zip "index_$SP_VERSION.zip" "proteins.tsv" "taxons.tsv" "sa_sparse3_compressed.bin" "uniprot_entries.tsv.lz4" + cd "build/$SP_VERSION" && zip "index_$SP_VERSION.zip" "proteins.tsv" "taxons.tsv" "sa_sparse3_compressed.bin" # Create a GitHub release and upload the zip file - name: Upload or Update Release From 7d6586b8218b232988f755aefb4700c75586a45b Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Mon, 12 Aug 2024 14:12:54 +0200 Subject: [PATCH 14/27] Update zip structure according to database --- .github/workflows/build_index.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml index 69503d7..2b12566 100644 --- a/.github/workflows/build_index.yml +++ b/.github/workflows/build_index.yml @@ -47,9 +47,7 @@ jobs: # Extract the contents of the output folder from the zip into a folder "build/input" - name: Extract zip contents run: | - unzip build/suffix-array.zip 'output/*' -d build/temp - mv build/temp/output/* build/input/ - rm -r build/temp + unzip build/suffix-array.zip '*' -d build/input # Make a directory with the SP_VERSION and process files - name: Process files From fb6e77a2f87f52aeb0b035c0579768b72200788f Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Wed, 11 Sep 2024 08:50:04 +0200 Subject: [PATCH 15/27] Represent chars in protein text with 5 bits, tests don't work yet --- Cargo.lock | 10 + bitarray/src/binary.rs | 8 +- bitarray/src/lib.rs | 18 +- sa-index/Cargo.toml | 1 + sa-index/src/lib.rs | 10 +- sa-index/src/sa_searcher.rs | 95 +++--- sa-index/src/suffix_to_protein_index.rs | 14 +- sa-mappings/Cargo.toml | 2 + sa-mappings/src/proteins.rs | 35 +-- text-compression/Cargo.toml | 9 + text-compression/src/lib.rs | 391 ++++++++++++++++++++++++ 11 files changed, 506 insertions(+), 87 deletions(-) create mode 100644 text-compression/Cargo.toml create mode 100644 text-compression/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index c29abc3..9d81263 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1127,15 +1127,18 @@ dependencies = [ "serde", "serde_json", "tempdir", + "text-compression", ] [[package]] name = "sa-mappings" version = "0.1.0" dependencies = [ + "bitarray", "bytelines", "fa-compression", "tempdir", + "text-compression", ] [[package]] @@ -1275,6 +1278,13 @@ dependencies = [ "remove_dir_all", ] +[[package]] +name = "text-compression" +version = "0.1.0" +dependencies = [ + "bitarray", +] + [[package]] name = "tinytemplate" version = "1.2.1" diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs index e7265cd..a8084d1 100644 --- a/bitarray/src/binary.rs +++ b/bitarray/src/binary.rs @@ -159,10 +159,10 @@ mod tests { #[test] fn test_write_binary() { let mut bitarray = BitArray::with_capacity(4, 40); - bitarray.set(0, 0x1234567890); - bitarray.set(1, 0xabcdef0123); - bitarray.set(2, 0x4567890abc); - bitarray.set(3, 0xdef0123456); + bitarray.set(0, 0x1234567890_u64); + bitarray.set(1, 0xabcdef0123_u64); + bitarray.set(2, 0x4567890abc_u64); + bitarray.set(3, 0xdef0123456_u64); let mut buffer = Vec::new(); bitarray.write_binary(&mut buffer).unwrap(); diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 655d17e..fe7b532 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -19,7 +19,7 @@ pub struct BitArray { /// The length of the bit array. len: usize, /// The number of bits in a single element of the data vector. - bits_per_value: usize + bits_per_value: usize, } impl BitArray { @@ -39,7 +39,7 @@ impl BitArray { data: vec![0; capacity * bits_per_value / 64 + extra], mask: (1 << bits_per_value) - 1, len: capacity, - bits_per_value + bits_per_value, } } @@ -85,6 +85,7 @@ impl BitArray { /// * `index` - The index of the value to set. /// * `value` - The value to set at the specified index. pub fn set(&mut self, index: usize, value: u64) { + let value: u64 = value.into(); let start_block = index * self.bits_per_value / 64; let start_block_offset = index * self.bits_per_value % 64; @@ -142,6 +143,11 @@ impl BitArray { pub fn clear(&mut self) { self.data.iter_mut().for_each(|x| *x = 0); } + + pub fn get_data_slice(&self, start_slice: usize, end_slice: usize) -> &[u64] { + &self.data[start_slice..end_slice] + } + } /// Writes the data to a writer in a binary format using a bit array. This function is helpfull @@ -257,10 +263,10 @@ mod tests { fn test_bitarray_set() { let mut bitarray = BitArray::with_capacity(4, 40); - bitarray.set(0, 0b0001110011111010110001000111111100110010); - bitarray.set(1, 0b1100001001010010011000010100110111001001); - bitarray.set(2, 0b1111001101001101101101101011101001010001); - bitarray.set(3, 0b0000100010010001010001001110101110011100); + bitarray.set(0, 0b0001110011111010110001000111111100110010_u64); + bitarray.set(1, 0b1100001001010010011000010100110111001001_u64); + bitarray.set(2, 0b1111001101001101101101101011101001010001_u64); + bitarray.set(3, 0b0000100010010001010001001110101110011100_u64); assert_eq!(bitarray.data, vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000]); } diff --git a/sa-index/Cargo.toml b/sa-index/Cargo.toml index de57fc9..25dda76 100644 --- a/sa-index/Cargo.toml +++ b/sa-index/Cargo.toml @@ -14,5 +14,6 @@ clap = { version = "4.4.8", features = ["derive"] } rayon = "1.8.1" serde = { version = "1.0.197", features = ["derive"] } sa-mappings = { path = "../sa-mappings" } +text-compression = { path = "../text-compression" } bitarray = { path = "../bitarray" } serde_json = "1.0.116" diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs index f276906..53f5348 100644 --- a/sa-index/src/lib.rs +++ b/sa-index/src/lib.rs @@ -115,11 +115,11 @@ mod tests { #[test] fn test_suffix_array_compressed() { let mut bitarray = BitArray::with_capacity(5, 40); - bitarray.set(0, 1); - bitarray.set(1, 2); - bitarray.set(2, 3); - bitarray.set(3, 4); - bitarray.set(4, 5); + bitarray.set(0, 1 as u64); + bitarray.set(1, 2 as u64); + bitarray.set(2, 3 as u64); + bitarray.set(3, 4 as u64); + bitarray.set(4, 5 as u64); let sa = SuffixArray::Compressed(bitarray, 1); assert_eq!(sa.len(), 5); diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index d09c704..7f60cbb 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -1,6 +1,7 @@ use std::{cmp::min, ops::Deref}; use sa_mappings::proteins::{Protein, Proteins}; +use text_compression::ProteinTextSlice; use crate::{ sa_searcher::BoundSearch::{Maximum, Minimum}, @@ -75,7 +76,7 @@ pub struct SparseSearcher(Searcher); impl SparseSearcher { pub fn new(sa: SuffixArray, proteins: Proteins) -> Self { - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); Self(searcher) } @@ -93,7 +94,7 @@ pub struct DenseSearcher(Searcher); impl DenseSearcher { pub fn new(sa: SuffixArray, proteins: Proteins) -> Self { - let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = DenseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); Self(searcher) } @@ -176,12 +177,12 @@ impl Searcher { // match as long as possible while index_in_search_string < search_string.len() - && index_in_suffix < self.proteins.input_string.len() - && (search_string[index_in_search_string] == self.proteins.input_string[index_in_suffix] + && index_in_suffix < self.proteins.text.len() + && (search_string[index_in_search_string] == self.proteins.text.get(index_in_suffix) as u8 || (search_string[index_in_search_string] == b'L' - && self.proteins.input_string[index_in_suffix] == b'I') + && self.proteins.text.get(index_in_suffix) as u8 == b'I') || (search_string[index_in_search_string] == b'I' - && self.proteins.input_string[index_in_suffix] == b'L')) + && self.proteins.text.get(index_in_suffix) as u8 == b'L')) { index_in_suffix += 1; index_in_search_string += 1; @@ -191,7 +192,7 @@ impl Searcher { if !search_string.is_empty() { if index_in_search_string == search_string.len() { is_cond_or_equal = true - } else if index_in_suffix < self.proteins.input_string.len() { + } else if index_in_suffix < self.proteins.text.len() { // in our index every L was replaced by a I, so we need to replace them if we want // to search in the right direction let peptide_char = if search_string[index_in_search_string] == b'L' { @@ -200,10 +201,10 @@ impl Searcher { search_string[index_in_search_string] }; - let protein_char = if self.proteins.input_string[index_in_suffix] == b'L' { + let protein_char = if self.proteins.text.get(index_in_suffix) as u8 == b'L' { b'I' } else { - self.proteins.input_string[index_in_suffix] + self.proteins.text.get(index_in_suffix) as u8 }; is_cond_or_equal = condition_check(peptide_char, protein_char); @@ -340,16 +341,14 @@ impl Searcher { // check at all if suffix >= skip && ((skip == 0 - || Self::check_prefix( - current_search_string_prefix, - &self.proteins.input_string[suffix - skip..suffix], - equate_il - )) - && Self::check_suffix( + || ProteinTextSlice::new(&self.proteins.text, suffix - skip, suffix) + .equals_slice(current_search_string_prefix, equate_il)) + && + Self::check_suffix( skip, il_locations_current_suffix, current_search_string_suffix, - &self.proteins.input_string[suffix..suffix + search_string.len() - skip], + ProteinTextSlice::new(&self.proteins.text, suffix, suffix + search_string.len() - skip), equate_il )) { @@ -419,19 +418,13 @@ impl Searcher { skip: usize, il_locations: &[usize], search_string: &[u8], - index_string: &[u8], + text_slice: ProteinTextSlice, equate_il: bool ) -> bool { if equate_il { true } else { - for &il_location in il_locations { - let index = il_location - skip; - if search_string[index] != index_string[index] { - return false; - } - } - true + text_slice.check_il_locations(skip, il_locations, search_string) } } @@ -459,6 +452,7 @@ impl Searcher { #[cfg(test)] mod tests { use sa_mappings::proteins::{Protein, Proteins}; + use text_compression::ProteinText; use crate::{ sa_searcher::{BoundSearchResult, SearchAllSuffixesResult, Searcher}, @@ -487,9 +481,11 @@ mod tests { } fn get_example_proteins() -> Proteins { - let text = "AI-BLACVAA-AC-KCRLZ$".to_string().into_bytes(); + let input_string = "AI-BLACVAA-AC-KCRLZ$"; + let text = ProteinText::from_string(input_string); + Proteins { - input_string: text, + text, proteins: vec![ Protein { uniprot_id: String::new(), @@ -520,7 +516,7 @@ mod tests { let proteins = get_example_proteins(); let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'A' @@ -541,7 +537,7 @@ mod tests { let proteins = get_example_proteins(); let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search suffix 'VAA' @@ -558,7 +554,7 @@ mod tests { let proteins = get_example_proteins(); let sa = SuffixArray::Original(vec![19, 10, 2, 13, 9, 8, 11, 5, 0, 3, 12, 15, 6, 1, 4, 17, 14, 16, 7, 18], 1); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); let bounds_res = searcher.search_bounds(&[b'I']); @@ -574,7 +570,7 @@ mod tests { let proteins = get_example_proteins(); let sa = SuffixArray::Original(vec![9, 0, 3, 12, 15, 6, 18], 3); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'RIZ' with equal I and L @@ -589,10 +585,11 @@ mod tests { // test edge case where an I or L is the first index in the sparse SA. #[test] fn test_l_first_index_in_sa() { - let text = "LMOXZ$".to_string().into_bytes(); + let input_string = "LMOXZ$"; + let text = ProteinText::from_string(input_string); let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, @@ -601,7 +598,7 @@ mod tests { }; let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'IM' with equal I and L @@ -611,10 +608,11 @@ mod tests { #[test] fn test_il_missing_matches() { - let text = "AAILLL$".to_string().into_bytes(); + let input_string = "AAILLL$"; + let text = ProteinText::from_string(input_string); let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, @@ -623,7 +621,7 @@ mod tests { }; let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); let found_suffixes = searcher.search_matching_suffixes(&[b'I'], usize::MAX, true); @@ -632,19 +630,20 @@ mod tests { #[test] fn test_il_duplication() { - let text = "IIIILL$".to_string().into_bytes(); + let input_string = "IIIILL$"; + let text = ProteinText::from_string(input_string); let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, functional_annotations: vec![] }] }; - + let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); let found_suffixes = searcher.search_matching_suffixes(&[b'I', b'I'], usize::MAX, true); @@ -653,10 +652,11 @@ mod tests { #[test] fn test_il_suffix_check() { - let text = "IIIILL$".to_string().into_bytes(); - + let input_string = "IIIILL$"; + let text = ProteinText::from_string(input_string); + let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, @@ -665,7 +665,7 @@ mod tests { }; let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); // search all places where II is in the string IIIILL, but with a sparse SA @@ -676,10 +676,11 @@ mod tests { #[test] fn test_il_duplication2() { - let text = "IILLLL$".to_string().into_bytes(); + let input_string = "IILLLL$"; + let text = ProteinText::from_string(input_string); let proteins = Proteins { - input_string: text, + text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, @@ -688,7 +689,7 @@ mod tests { }; let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); - let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.input_string); + let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'IM' with equal I and L diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs index 121b569..6aed362 100644 --- a/sa-index/src/suffix_to_protein_index.rs +++ b/sa-index/src/suffix_to_protein_index.rs @@ -2,6 +2,7 @@ use clap::ValueEnum; use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER}; use crate::Nullable; +use text_compression::ProteinText; /// Enum used to define the commandline arguments and choose which index style is used #[derive(ValueEnum, Clone, Debug, PartialEq)] @@ -66,10 +67,10 @@ impl DenseSuffixToProtein { /// # Returns /// /// Returns a new DenseSuffixToProtein build over the provided text - pub fn new(text: &[u8]) -> Self { + pub fn new(text: &ProteinText) -> Self { let mut current_protein_index: u32 = 0; let mut suffix_index_to_protein: Vec = vec![]; - for &char in text.iter() { + for char in text.iter() { if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER { current_protein_index += 1; suffix_index_to_protein.push(u32::NULL); @@ -92,9 +93,9 @@ impl SparseSuffixToProtein { /// # Returns /// /// Returns a new SparseSuffixToProtein build over the provided text - pub fn new(text: &[u8]) -> Self { + pub fn new(text: &ProteinText) -> Self { let mut suffix_index_to_protein: Vec = vec![0]; - for (index, &char) in text.iter().enumerate() { + for (index, char) in text.iter().enumerate() { if char == SEPARATION_CHARACTER || char == TERMINATION_CHARACTER { suffix_index_to_protein.push(index as i64 + 1); } @@ -108,6 +109,7 @@ impl SparseSuffixToProtein { mod tests { use clap::ValueEnum; use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER}; + use text_compression::ProteinText; use crate::{ suffix_to_protein_index::{ @@ -116,10 +118,10 @@ mod tests { Nullable }; - fn build_text() -> Vec { + fn build_text() -> ProteinText { let mut text = ["ACG", "CG", "AAA"].join(&format!("{}", SEPARATION_CHARACTER as char)); text.push(TERMINATION_CHARACTER as char); - text.into_bytes() + ProteinText::from_string(&text) } #[test] diff --git a/sa-mappings/Cargo.toml b/sa-mappings/Cargo.toml index b20a2bf..d255f7c 100644 --- a/sa-mappings/Cargo.toml +++ b/sa-mappings/Cargo.toml @@ -11,3 +11,5 @@ tempdir = "0.3.7" [dependencies] fa-compression = { path = "../fa-compression" } bytelines = "2.5.0" +bitarray = { path = "../bitarray" } +text-compression = { path = "../text-compression" } diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs index f2b24cc..ca3bdd7 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -5,6 +5,7 @@ use std::{error::Error, fs::File, io::BufReader, ops::Index, str::from_utf8}; use bytelines::ByteLines; use fa_compression::algorithm1::{decode, encode}; +use text_compression::ProteinText; /// The separation character used in the input string pub static SEPARATION_CHARACTER: u8 = b'-'; @@ -28,7 +29,7 @@ pub struct Protein { /// A struct that represents a collection of proteins pub struct Proteins { /// The input string containing all proteins - pub input_string: Vec, + pub text: ProteinText, /// The proteins in the input string pub proteins: Vec @@ -86,12 +87,13 @@ impl Proteins { input_string.pop(); input_string.push(TERMINATION_CHARACTER.into()); - input_string.shrink_to_fit(); proteins.shrink_to_fit(); - Ok(Self { input_string: input_string.into_bytes(), proteins }) + + let text = ProteinText::from_string(&input_string); + Ok(Self { text, proteins }) } - /// Creates a `vec` which represents all the proteins concatenated from the database file + /// Creates a `ProteinText` which represents all the proteins concatenated from the database file /// /// # Arguments /// * `file` - The path to the database file @@ -99,12 +101,12 @@ impl Proteins { /// /// # Returns /// - /// Returns a `Result` containing the `Vec` + /// Returns a `Result` containing the `ProteinText` /// /// # Errors /// /// Returns a `Box` if an error occurred while reading the database file - pub fn try_from_database_file_without_annotations(database_file: &str) -> Result, Box> { + pub fn try_from_database_file_without_annotations(database_file: &str) -> Result> { let mut input_string: String = String::new(); let file = File::open(database_file)?; @@ -123,11 +125,10 @@ impl Proteins { input_string.push(SEPARATION_CHARACTER.into()); } - input_string.pop(); - input_string.push(TERMINATION_CHARACTER.into()); + let text = ProteinText::from_string(&input_string); + + Ok(text) - input_string.shrink_to_fit(); - Ok(input_string.into_bytes()) } } @@ -181,8 +182,10 @@ mod tests { #[test] fn test_new_proteins() { + let input_string = "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG"; + let text = ProteinText::from_string(&input_string); let proteins = Proteins { - input_string: "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes().to_vec(), + text, proteins: vec![ Protein { uniprot_id: "P12345".to_string(), @@ -197,7 +200,6 @@ mod tests { ] }; - assert_eq!(proteins.input_string, "MLPGLALLLLAAWTARALEV-PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG".as_bytes()); assert_eq!(proteins.proteins.len(), 2); assert_eq!(proteins[0].uniprot_id, "P12345"); assert_eq!(proteins[0].taxon_id, 1); @@ -245,12 +247,7 @@ mod tests { let proteins = Proteins::try_from_database_file_without_annotations(database_file.to_str().unwrap()).unwrap(); - let sep_char = SEPARATION_CHARACTER as char; - let end_char = TERMINATION_CHARACTER as char; - let expected = format!( - "MLPGLALLLLAAWTARALEV{}PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG{}KWDSDPSGTKTCIDT{}KEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH{}", - sep_char, sep_char, sep_char, end_char - ); - assert_eq!(proteins, expected.as_bytes()); + let expected = 'L' as u8; + assert_eq!(proteins.get(4), expected); } } diff --git a/text-compression/Cargo.toml b/text-compression/Cargo.toml new file mode 100644 index 0000000..c312a3c --- /dev/null +++ b/text-compression/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "text-compression" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +bitarray = { path = "../bitarray" } diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs new file mode 100644 index 0000000..60b2463 --- /dev/null +++ b/text-compression/src/lib.rs @@ -0,0 +1,391 @@ +use std::{ + error::Error, + io::{BufRead, Write} +}; +use std::collections::HashMap; + +use bitarray::{data_to_writer, Binary, BitArray}; + +pub struct ProteinText { + bit_array: BitArray, + char_to_5bit: HashMap, + bit5_to_char: Vec, +} + +impl ProteinText { + + fn create_char_to_5bit_hashmap() -> HashMap { + let mut hashmap = HashMap::::new(); + for (i, c) in "ACDEFGHIKLMNPQRSTVWY-".chars().enumerate() { + hashmap.insert(c as u8, i as u8); + } + + hashmap + } + + fn create_bit5_to_char() -> Vec { + let mut vec = Vec::::new(); + for c in "ACDEFGHIKLMNPQRSTVWY-".chars() { + vec.push(c as u8); + } + vec + } + + pub fn from_string(input_string: &str) -> ProteinText { + let char_to_5bit = ProteinText::create_char_to_5bit_hashmap(); + let bit5_to_char = ProteinText::create_bit5_to_char(); + + let mut bit_array = BitArray::with_capacity(input_string.len(), 5); + for (i, c) in input_string.chars().enumerate() { + let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect("Input character not in alphabet"); + bit_array.set(i, char_5bit as u64); + } + + Self { bit_array, char_to_5bit, bit5_to_char } + } + + pub fn from_vec(input_vec: &Vec) -> ProteinText { + let char_to_5bit = ProteinText::create_char_to_5bit_hashmap(); + let bit5_to_char = ProteinText::create_bit5_to_char(); + + let mut bit_array = BitArray::with_capacity(input_vec.len(), 5); + for (i, e) in input_vec.iter().enumerate() { + let char_5bit: u8 = *char_to_5bit.get(e).expect("Input character not in alphabet"); + bit_array.set(i, char_5bit as u64); + } + + Self { bit_array, char_to_5bit, bit5_to_char } + } + + pub fn new(bit_array: BitArray) -> ProteinText { + let char_to_5bit = ProteinText::create_char_to_5bit_hashmap(); + let bit5_to_char = ProteinText::create_bit5_to_char(); + Self { bit_array, char_to_5bit, bit5_to_char } + } + + pub fn with_capacity(capacity: usize) -> Self { + Self::new(BitArray::with_capacity(capacity, 5)) + } + + pub fn get(&self, index: usize) -> u8 { + let char_5bit = self.bit_array.get(index) as usize; + self.bit5_to_char[char_5bit] + } + + pub fn set(&mut self, index: usize, value: u8) { + let char_5bit: u8 = *self.char_to_5bit.get(&value).expect("Input character not in alphabet"); + self.bit_array.set(index, char_5bit as u64); + } + + pub fn len(&self) -> usize { + self.bit_array.len() + } + + pub fn is_empty(&self) -> bool { + self.bit_array.len() == 0 + } + + /// Clears the `BitArray`, setting all bits to 0. + pub fn clear(&mut self) { + self.bit_array.clear() + } + + pub fn iter(&self) -> ProteinTextIterator { + ProteinTextIterator {protein_text: self, index: 0, } + } + +} + +pub struct ProteinTextSlice<'a> { + text: &'a ProteinText, + start: usize, // included + end: usize, // excluded +} + +impl<'a> ProteinTextSlice<'a> { + + pub fn new(text: &'a ProteinText, start: usize, end: usize) -> ProteinTextSlice { + Self {text, start, end } + } + + pub fn get(&self, index: usize) -> u8 { + self.text.get(self.start + index) + } + + pub fn len(&self) -> usize { + self.end - self.start + } + + #[inline] + pub fn equals_slice(&self, other: &[u8], equate_il: bool) -> bool { + if equate_il { + other.iter().zip(self.iter()).all(|(&search_character, text_character)| { + search_character == text_character + || (search_character == b'I' && text_character == b'L') + || (search_character == b'L' && text_character == b'I') + }) + } else { + other.iter().zip(self.iter()).all(|(&search_character, text_character)| search_character == text_character) + } + } + + pub fn check_il_locations( + &self, + skip: usize, + il_locations: &[usize], + search_string: &[u8], + ) -> bool { + for &il_location in il_locations { + let index = il_location - skip; + if search_string[index] != self.get(index) { + return false; + } + } + true + } + + pub fn iter(&self) -> ProteinTextSliceIterator { + ProteinTextSliceIterator {text_slice: self, index: 0, } + } +} + +pub struct ProteinTextIterator<'a> { + protein_text: &'a ProteinText, + index: usize, +} + +pub struct ProteinTextSliceIterator<'a> { + text_slice: &'a ProteinTextSlice<'a>, + index: usize, +} + +impl<'a> Iterator for ProteinTextSliceIterator<'a> { + + type Item = u8; + + fn next(&mut self) -> Option { + if self.index >= self.text_slice.len() { + return None; + } + + self.index += 1; + Some(self.text_slice.get(self.index - 1)) + } +} + +impl<'a> Iterator for ProteinTextIterator<'a> { + + type Item = u8; + + fn next(&mut self) -> Option { + if self.index >= self.protein_text.len() { + return None; + } + + self.index += 1; + Some(self.protein_text.get(self.index - 1)) + } +} + +/// Writes the compressed text to a writer. +/// +/// # Arguments +/// +/// * `text` - The text to be compressed. +/// * `writer` - The writer to which the compressed text will be written. +/// +/// # Errors +/// +/// Returns an error if writing to the writer fails. +pub fn dump_compressed_text( + text: Vec, + writer: &mut impl Write +) -> Result<(), Box> { + let bits_per_value = 5; + + // Write the flags to the writer + // 00000001 indicates that the text is compressed + writer + .write(&[bits_per_value as u8]) + .map_err(|_| "Could not write the required bits to the writer")?; + + // Write the size of the text to the writer + writer + .write(&(text.len() as u64).to_le_bytes()) + .map_err(|_| "Could not write the size of the text to the writer")?; + + // Compress the text and write it to the writer + let text_writer: Vec = text.iter().map(|item| ::from(*item)).collect(); + data_to_writer(text_writer, bits_per_value, 8 * 1024, writer) + .map_err(|_| "Could not write the compressed text to the writer")?; + + Ok(()) +} + +/// Load the compressed text from a reader. +/// +/// # Arguments +/// +/// * `reader` - The reader from which the compressed text will be read. +/// +/// # Errors +/// +/// Returns an error if reading from the reader fails. +pub fn load_compressed_text( + reader: &mut impl BufRead +) -> Result> { + let bits_per_value: usize = 5; + // Read the size of the text from the binary file (8 bytes) + let mut size_buffer = [0_u8; 8]; + reader + .read_exact(&mut size_buffer) + .map_err(|_| "Could not read the size of the text from the binary file")?; + let size = u64::from_le_bytes(size_buffer) as usize; + + // Read the compressed text from the binary file + let mut compressed_text = BitArray::with_capacity(size, bits_per_value); + compressed_text + .read_binary(reader) + .map_err(|_| "Could not read the compressed text from the binary file")?; + + Ok(ProteinText::new(compressed_text)) +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use super::*; + + pub struct FailingWriter { + /// The number of times the write function can be called before it fails. + pub valid_write_count: usize + } + + impl Write for FailingWriter { + fn write(&mut self, _: &[u8]) -> Result { + if self.valid_write_count == 0 { + return Err(std::io::Error::new(std::io::ErrorKind::Other, "Write failed")); + } + + self.valid_write_count -= 1; + Ok(1) + } + + fn flush(&mut self) -> Result<(), std::io::Error> { + Ok(()) + } + } + + pub struct FailingReader { + /// The number of times the read function can be called before it fails. + pub valid_read_count: usize + } + + impl Read for FailingReader { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + if self.valid_read_count == 0 { + return Err(std::io::Error::new(std::io::ErrorKind::Other, "Read failed")); + } + + self.valid_read_count -= 1; + Ok(buf.len()) + } + } + + impl BufRead for FailingReader { + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { + Ok(&[]) + } + + fn consume(&mut self, _: usize) {} + } + + #[test] + fn test_dump_compressed_text() { + let text: Vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; + + let mut writer = vec![]; + dump_compressed_text(text, &mut writer).unwrap(); + + assert_eq!(writer, vec![ + // bits per value + 5, // size of the text + 10, 0, 0, 0, 0, 0, 0, 0, // compressed text + 0, 128, 74, 232, 152, 66, 134, 8 + ]); + } + + #[test] + #[should_panic(expected = "Could not write the required bits to the writer")] + fn test_dump_compressed_text_fail_required_bits() { + let mut writer = FailingWriter { valid_write_count: 0 }; + + dump_compressed_text(vec![], &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the size of the text to the writer")] + fn test_dump_compressed_text_fail_size() { + let mut writer = FailingWriter { valid_write_count: 1 }; + + dump_compressed_text(vec![], &mut writer).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not write the compressed text to the writer")] + fn test_dump_compressed_text_fail_compressed_text() { + let mut writer = FailingWriter { valid_write_count: 3 }; + + dump_compressed_text(vec![1], &mut writer).unwrap(); + } + + #[test] + fn test_load_compressed_text() { + let data = vec![ + // size of the text + 10, 0, 0, 0, 0, 0, 0, 0, // compressed text + 0, 128, 74, 232, 152, 66, 134, 8 + ]; + + let mut reader = std::io::BufReader::new(&data[..]); + let compressed_text = load_compressed_text(&mut reader).unwrap(); + + for i in 0..10 { + assert_eq!(compressed_text.get(i), i as u8 + 1); + } + } + + #[test] + #[should_panic(expected = "Could not read the size of the text from the binary file")] + fn test_load_compressed_text_fail_size() { + let mut reader = FailingReader { valid_read_count: 0 }; + + load_compressed_text(&mut reader).unwrap(); + } + + #[test] + #[should_panic(expected = "Could not read the compressed text from the binary file")] + fn test_load_compressed_text_fail_compressed_text() { + let mut reader = FailingReader { valid_read_count: 2 }; + + load_compressed_text(&mut reader).unwrap(); + } + + #[test] + fn test_failing_writer() { + let mut writer = FailingWriter { valid_write_count: 0 }; + assert!(writer.flush().is_ok()); + assert!(writer.write(&[0]).is_err()); + } + + #[test] + fn test_failing_reader() { + let mut reader = FailingReader { valid_read_count: 0 }; + let right_buffer: [u8; 0] = []; + assert_eq!(reader.fill_buf().unwrap(), &right_buffer); + assert_eq!(reader.consume(0), ()); + let mut buffer = [0_u8; 1]; + assert!(reader.read(&mut buffer).is_err()); + } +} From b0a804d1eb09feb19c6039c7610192f7ef5a5613 Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Wed, 11 Sep 2024 09:54:33 +0200 Subject: [PATCH 16/27] fix tests to only use characters of peptide alphabet --- sa-index/src/sa_searcher.rs | 10 +++++----- text-compression/src/lib.rs | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index 7f60cbb..2324046 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -481,7 +481,7 @@ mod tests { } fn get_example_proteins() -> Proteins { - let input_string = "AI-BLACVAA-AC-KCRLZ$"; + let input_string = "AI-CLACVAA-AC-KCRLY$"; let text = ProteinText::from_string(input_string); Proteins { @@ -561,7 +561,7 @@ mod tests { assert_eq!(bounds_res, BoundSearchResult::SearchResult((13, 16))); // search bounds 'RIZ' with equal I and L - let bounds_res = searcher.search_bounds(&[b'R', b'I', b'Z']); + let bounds_res = searcher.search_bounds(&[b'R', b'I', b'Y']); assert_eq!(bounds_res, BoundSearchResult::SearchResult((17, 18))); } @@ -574,18 +574,18 @@ mod tests { let searcher = Searcher::new(sa, proteins, Box::new(suffix_index_to_protein)); // search bounds 'RIZ' with equal I and L - let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, true); + let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Y'], usize::MAX, true); assert_eq!(found_suffixes, SearchAllSuffixesResult::SearchResult(vec![16])); // search bounds 'RIZ' without equal I and L - let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Z'], usize::MAX, false); + let found_suffixes = searcher.search_matching_suffixes(&[b'R', b'I', b'Y'], usize::MAX, false); assert_eq!(found_suffixes, SearchAllSuffixesResult::NoMatches); } // test edge case where an I or L is the first index in the sparse SA. #[test] fn test_l_first_index_in_sa() { - let input_string = "LMOXZ$"; + let input_string = "LMPYY$"; let text = ProteinText::from_string(input_string); let proteins = Proteins { diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs index 60b2463..b090826 100644 --- a/text-compression/src/lib.rs +++ b/text-compression/src/lib.rs @@ -16,7 +16,7 @@ impl ProteinText { fn create_char_to_5bit_hashmap() -> HashMap { let mut hashmap = HashMap::::new(); - for (i, c) in "ACDEFGHIKLMNPQRSTVWY-".chars().enumerate() { + for (i, c) in "ACDEFGHIKLMNPQRSTVWY-$".chars().enumerate() { hashmap.insert(c as u8, i as u8); } @@ -25,7 +25,7 @@ impl ProteinText { fn create_bit5_to_char() -> Vec { let mut vec = Vec::::new(); - for c in "ACDEFGHIKLMNPQRSTVWY-".chars() { + for c in "ACDEFGHIKLMNPQRSTVWY-$".chars() { vec.push(c as u8); } vec From e47646106fb329f514e39d3127d1c613bbf9c80a Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Wed, 11 Sep 2024 10:44:26 +0200 Subject: [PATCH 17/27] use uncompressed text for SA construction --- sa-builder/src/main.rs | 2 +- sa-index/src/sa_searcher.rs | 27 +----------------------- sa-mappings/src/proteins.rs | 41 +++++++++++++++++++++++++++++++++++-- text-compression/src/lib.rs | 4 ++-- 4 files changed, 43 insertions(+), 31 deletions(-) diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index 98a1414..01cc3c4 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -21,7 +21,7 @@ fn main() { eprintln!(); eprintln!("📋 Started loading the proteins..."); let start_proteins_time = get_time_ms().unwrap(); - let mut data = Proteins::try_from_database_file_without_annotations(&database_file) + let mut data = Proteins::try_from_database_file_uncompressed(&database_file) .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); eprintln!( "✅ Successfully loaded the proteins in {} seconds!", diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index 2324046..4f4522e 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -342,7 +342,7 @@ impl Searcher { if suffix >= skip && ((skip == 0 || ProteinTextSlice::new(&self.proteins.text, suffix - skip, suffix) - .equals_slice(current_search_string_prefix, equate_il)) + .equals_slice(current_search_string_prefix, equate_il)) // Check the prefix && Self::check_suffix( skip, @@ -372,31 +372,6 @@ impl Searcher { } } - /// Returns true of the prefixes are the same - /// if `equate_il` is set to true, L and I are considered the same - /// - /// # Arguments - /// * `search_string_prefix` - The unchecked prefix of the string/peptide that is searched - /// * `index_prefix` - The unchecked prefix from the protein from the suffix array - /// * `equate_il` - True if we want to equate I and L during search, otherwise false - /// - /// # Returns - /// - /// Returns true if `search_string_prefix` and `index_prefix` are considered the same, otherwise - /// false - #[inline] - fn check_prefix(search_string_prefix: &[u8], index_prefix: &[u8], equate_il: bool) -> bool { - if equate_il { - search_string_prefix.iter().zip(index_prefix).all(|(&search_character, &index_character)| { - search_character == index_character - || (search_character == b'I' && index_character == b'L') - || (search_character == b'L' && index_character == b'I') - }) - } else { - search_string_prefix == index_prefix - } - } - /// Returns true of the search_string and index_string are equal /// This is automatically true if `equate_il` is set to true, since there matched during /// search where I = L If `equate_il` is set to false, we need to check if the I and diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs index ca3bdd7..626ead3 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -47,7 +47,6 @@ impl Proteins { /// /// # Arguments /// * `file` - The path to the database file - /// * `taxon_aggregator` - The `TaxonAggregator` to use /// /// # Returns /// @@ -97,7 +96,6 @@ impl Proteins { /// /// # Arguments /// * `file` - The path to the database file - /// * `taxon_aggregator` - The `TaxonAggregator` to use /// /// # Returns /// @@ -130,6 +128,45 @@ impl Proteins { Ok(text) } + + /// Creates a `vec` which represents all the proteins concatenated from the database file + /// + /// # Arguments + /// * `file` - The path to the database file + /// + /// # Returns + /// + /// Returns a `Result` containing the `Vec` + /// + /// # Errors + /// + /// Returns a `Box` if an error occurred while reading the database file + pub fn try_from_database_file_uncompressed(database_file: &str) -> Result, Box> { + let mut input_string: String = String::new(); + + let file = File::open(database_file)?; + + // Read the lines as bytes, since the input string is not guaranteed to be utf8 + // because of the encoded functional annotations + let mut lines = ByteLines::new(BufReader::new(file)); + + while let Some(Ok(line)) = lines.next() { + let mut fields = line.split(|b| *b == b'\t'); + + // only get the taxon id and sequence from each line, we don't need the other parts + let sequence = from_utf8(fields.nth(2).unwrap())?; + + input_string.push_str(&sequence.to_uppercase()); + input_string.push(SEPARATION_CHARACTER.into()); + } + + input_string.pop(); + input_string.push(TERMINATION_CHARACTER.into()); + + input_string.shrink_to_fit(); + Ok(input_string.into_bytes()) + + } } impl Index for Proteins { diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs index b090826..871de5b 100644 --- a/text-compression/src/lib.rs +++ b/text-compression/src/lib.rs @@ -351,8 +351,8 @@ mod tests { let mut reader = std::io::BufReader::new(&data[..]); let compressed_text = load_compressed_text(&mut reader).unwrap(); - for i in 0..10 { - assert_eq!(compressed_text.get(i), i as u8 + 1); + for (i, c) in "CDEFGHIKLM".chars().enumerate() { + assert_eq!(compressed_text.get(i), c as u8); } } From 2c14c52bfc7e6fc5476bdc6d7d698cd1630734c5 Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Wed, 11 Sep 2024 11:46:59 +0200 Subject: [PATCH 18/27] add tests for text-compression --- text-compression/src/lib.rs | 108 +++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs index 871de5b..2e25f7b 100644 --- a/text-compression/src/lib.rs +++ b/text-compression/src/lib.rs @@ -94,6 +94,10 @@ impl ProteinText { ProteinTextIterator {protein_text: self, index: 0, } } + pub fn slice(&self, start: usize, end:usize) -> ProteinTextSlice { + ProteinTextSlice::new(self, start, end) + } + } pub struct ProteinTextSlice<'a> { @@ -253,7 +257,7 @@ pub fn load_compressed_text( #[cfg(test)] mod tests { - use std::io::Read; + use std::{char, io::Read}; use super::*; @@ -301,6 +305,108 @@ mod tests { fn consume(&mut self, _: usize) {} } + #[test] + fn test_u8_5bit_conversion() { + let char_to_5bit = ProteinText::create_char_to_5bit_hashmap(); + let bit5_to_char = ProteinText::create_bit5_to_char(); + + for c in "ACDEFGHIKLMNPQRSTVWY-$".chars() { + let char_5bit = char_to_5bit.get(&(c as u8)).unwrap(); + assert_eq!(c as u8, bit5_to_char[*char_5bit as usize]); + } + } + + #[test] + fn test_build_from_string() { + let text = ProteinText::from_string("ACACA-CAC$"); + + for (i, c) in "ACACA-CAC$".chars().enumerate() { + assert_eq!(c as u8, text.get(i)); + } + } + + #[test] + fn test_build_from_vec() { + let vec = vec![b'A', b'C', b'A', b'C', b'A', b'-', b'C', b'A', b'C', b'$']; + let text = ProteinText::from_vec(&vec); + + for (i, c) in "ACACA-CAC$".chars().enumerate() { + assert_eq!(c as u8, text.get(i)); + } + } + + #[test] + fn test_build_from_bitarray() { + let input_string = "ACACA-CAC$"; + let char_to_5bit = ProteinText::create_char_to_5bit_hashmap(); + + let mut bit_array = BitArray::with_capacity(input_string.len(), 5); + for (i, c) in input_string.chars().enumerate() { + let char_5bit: u8 = *char_to_5bit.get(&(c as u8)).expect("Input character not in alphabet"); + bit_array.set(i, char_5bit as u64); + } + + let text = ProteinText::new(bit_array); + + for (i, c) in "ACACA-CAC$".chars().enumerate() { + assert_eq!(c as u8, text.get(i)); + } + } + + #[test] + fn test_build_with_capacity() { + let input_string = "ACACA-CAC$"; + + let mut text = ProteinText::with_capacity(input_string.len()); + for (i, c) in "ACACA-CAC$".chars().enumerate() { + text.set(i, c as u8); + } + + for (i, c) in "ACACA-CAC$".chars().enumerate() { + assert_eq!(c as u8, text.get(i)); + } + } + + #[test] + fn test_text_slice() { + let input_string = "ACACA-CAC$"; + let start = 1; + let end = 5; + let text = ProteinText::from_string(&input_string); + let text_slice = text.slice(start, end); + + for (i, c) in input_string[start..end].chars().enumerate() { + assert_eq!(c as u8, text_slice.get(i)); + } + } + + #[test] + fn test_equals_slice() { + let input_string = "ACICA-CAC$"; + let text = ProteinText::from_string(&input_string); + let text_slice = text.slice(1, 5); + let eq_slice_true = [b'C', b'I', b'C', b'A']; + let eq_slice_false = [b'C', b'C', b'C', b'A']; + let eq_slice_il_true = [b'C', b'L', b'C', b'A']; + + assert!(text_slice.equals_slice(&eq_slice_true, false)); + assert!(! text_slice.equals_slice(&eq_slice_false, false)); + assert!(text_slice.equals_slice(&eq_slice_il_true, true)); + } + + #[test] + fn test_check_il_locations() { + let input_string = "ACILA-CAC$"; + let text = ProteinText::from_string(&input_string); + let text_slice = text.slice(1, 5); + let il_locations = [1, 2]; + let il_true = [b'C', b'I', b'L', b'A']; + let il_false = [b'C', b'I', b'C', b'A']; + + assert!(text_slice.check_il_locations(0, &il_locations, &il_true)); + assert!(! text_slice.check_il_locations(0, &il_locations, &il_false)); + } + #[test] fn test_dump_compressed_text() { let text: Vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; From 224fa5185adc06694d102ebd34592e85d3f9c26a Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Wed, 11 Sep 2024 13:59:51 +0200 Subject: [PATCH 19/27] Add documentation to text compression --- text-compression/src/lib.rs | 145 +++++++++++++++++++++++++++++++++++- 1 file changed, 144 insertions(+), 1 deletion(-) diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs index 2e25f7b..6bfaf1a 100644 --- a/text-compression/src/lib.rs +++ b/text-compression/src/lib.rs @@ -6,14 +6,23 @@ use std::collections::HashMap; use bitarray::{data_to_writer, Binary, BitArray}; +/// Structure representing the proteins, stored in a bit array using 5 bits per amino acid. pub struct ProteinText { + /// Bit array holding the sequence of amino acids bit_array: BitArray, + /// Hashmap storing the mapping between the character as `u8` and a 5 bit number. char_to_5bit: HashMap, + /// Vector storing the mapping between the 5 bit number and the character as `u8`. bit5_to_char: Vec, } impl ProteinText { + /// Creates the hashmap storing the mappings between the characters as `u8` and 5 bit numbers. + /// + /// # Returns + /// + /// Returns the hashmap fn create_char_to_5bit_hashmap() -> HashMap { let mut hashmap = HashMap::::new(); for (i, c) in "ACDEFGHIKLMNPQRSTVWY-$".chars().enumerate() { @@ -23,6 +32,11 @@ impl ProteinText { hashmap } + /// Creates the vector storing the mappings between the 5 bit numbers and the characters as `u8`. + /// + /// # Returns + /// + /// Returns the vector fn create_bit5_to_char() -> Vec { let mut vec = Vec::::new(); for c in "ACDEFGHIKLMNPQRSTVWY-$".chars() { @@ -31,6 +45,14 @@ impl ProteinText { vec } + /// Creates the compressed text from a string. + /// + /// # Arguments + /// * `input_string` - The text (proteins) in string format + /// + /// # Returns + /// + /// An instance of `ProteinText` pub fn from_string(input_string: &str) -> ProteinText { let char_to_5bit = ProteinText::create_char_to_5bit_hashmap(); let bit5_to_char = ProteinText::create_bit5_to_char(); @@ -44,6 +66,14 @@ impl ProteinText { Self { bit_array, char_to_5bit, bit5_to_char } } + /// Creates the compressed text from a vector. + /// + /// # Arguments + /// * `input_vec` - The text (proteins) in a vector with elements of type `u8` representing the amino acids. + /// + /// # Returns + /// + /// An instance of `ProteinText` pub fn from_vec(input_vec: &Vec) -> ProteinText { let char_to_5bit = ProteinText::create_char_to_5bit_hashmap(); let bit5_to_char = ProteinText::create_bit5_to_char(); @@ -57,30 +87,69 @@ impl ProteinText { Self { bit_array, char_to_5bit, bit5_to_char } } + /// Creates the compressed text from a bit array. + /// + /// # Arguments + /// * `bit_array` - The text (proteins) in a bit array using 5 bits for each amino acid. + /// + /// # Returns + /// + /// An instance of `ProteinText` pub fn new(bit_array: BitArray) -> ProteinText { let char_to_5bit = ProteinText::create_char_to_5bit_hashmap(); let bit5_to_char = ProteinText::create_bit5_to_char(); Self { bit_array, char_to_5bit, bit5_to_char } } + /// Creates an instance of `ProteinText` with a given capacity. + /// + /// # Arguments + /// * `capacity` - The amount of characters in the text. + /// + /// # Returns + /// + /// An instance of `ProteinText` pub fn with_capacity(capacity: usize) -> Self { Self::new(BitArray::with_capacity(capacity, 5)) } + /// Search the character at a given position in the compressed text. + /// + /// # Arguments + /// * `index` - The index of the character to search. + /// + /// # Returns + /// + /// the character at position `index` as `u8`. pub fn get(&self, index: usize) -> u8 { let char_5bit = self.bit_array.get(index) as usize; self.bit5_to_char[char_5bit] } + /// Set the character at a given index. + /// + /// # Arguments + /// * `index` - The index of the character to change. + /// * `value` - The character to fill in as `u8`. pub fn set(&mut self, index: usize, value: u8) { let char_5bit: u8 = *self.char_to_5bit.get(&value).expect("Input character not in alphabet"); self.bit_array.set(index, char_5bit as u64); } + /// Queries the length of the text. + /// + /// # Returns + /// + /// the length of the text pub fn len(&self) -> usize { self.bit_array.len() } + /// Check if the text is empty (length 0). + /// + /// # Returns + /// + /// true if the the text has length 0, false otherwise. pub fn is_empty(&self) -> bool { self.bit_array.len() == 0 } @@ -90,36 +159,83 @@ impl ProteinText { self.bit_array.clear() } + /// Get an iterator over the characters of the text. + /// + /// # Returns + /// + /// A `ProteinTextIterator`, which can iterate over the characters of the text. pub fn iter(&self) -> ProteinTextIterator { ProteinTextIterator {protein_text: self, index: 0, } } + /// Get a slice of the text + /// + /// # Returns + /// + /// An `ProteinTextSlice` representing a slice of the text. pub fn slice(&self, start: usize, end:usize) -> ProteinTextSlice { ProteinTextSlice::new(self, start, end) } } +/// Structure representing a slice of a `ProteinText`. pub struct ProteinTextSlice<'a> { + /// The `Proteintext` of whih to take a slice. text: &'a ProteinText, + /// The start of the slice. start: usize, // included + /// The end of the slice. end: usize, // excluded } impl<'a> ProteinTextSlice<'a> { + /// Creates an instance of `ProteintextSlice`, given the text and boundaries. + /// + /// # Arguments + /// * `text` - The `Proteintext` representing the text of proteins with 5 bits per amino acid. + /// * `start` - The start of the slice. + /// * `end` - The end of the slice. + /// + /// # Returns + /// + /// An instance of `ProteinTextSlice` pub fn new(text: &'a ProteinText, start: usize, end: usize) -> ProteinTextSlice { Self {text, start, end } } + /// Get a character (amino acid) in the slice. + /// + /// # Arguments + /// * `index` - The index in the slice of the character to get. + /// + /// # Returns + /// + /// The character as `u8`. pub fn get(&self, index: usize) -> u8 { self.text.get(self.start + index) } + /// Get the length of the slice. + /// + /// # Returns + /// + /// The length of the slice. pub fn len(&self) -> usize { self.end - self.start } + /// Checks if the slice and a given array of `u8` are equal. + /// I and L can be equated. + /// + /// # Arguments + /// * `other` - the array of `u8` to compare the slice with. + /// * `equate_il` - true if I and L need to be equated, false otherwise. + /// + /// # Returns + /// + /// True if the slice is equal to the given array, false otherwise. #[inline] pub fn equals_slice(&self, other: &[u8], equate_il: bool) -> bool { if equate_il { @@ -133,6 +249,16 @@ impl<'a> ProteinTextSlice<'a> { } } + /// Check if the slice and a given array of `u8` are equal on the I and L positions. + /// + /// # Arguments + /// * `skip` - The amount of positions this slice skipped, this has an influence on the I and L positions. + /// * `il_locations` - The positions where I and L occur. + /// * `search_string` - An array of `u8` to compare the slice with. + /// + /// # Returns + /// + /// True if the slice and `search_string` have the same contents on the I and L positions, false otherwise. pub fn check_il_locations( &self, skip: usize, @@ -148,16 +274,23 @@ impl<'a> ProteinTextSlice<'a> { true } + /// Get an iterator over the slice. + /// + /// # Returns + /// + /// An iterator over the slice. pub fn iter(&self) -> ProteinTextSliceIterator { ProteinTextSliceIterator {text_slice: self, index: 0, } } } +/// Structure representing an iterator over a `ProteinText` instance, iterating the characters of the text. pub struct ProteinTextIterator<'a> { protein_text: &'a ProteinText, index: usize, } +/// Structure representing an iterator over a `ProteintextSlice` instance, iterating the characters of the slice. pub struct ProteinTextSliceIterator<'a> { text_slice: &'a ProteinTextSlice<'a>, index: usize, @@ -167,6 +300,11 @@ impl<'a> Iterator for ProteinTextSliceIterator<'a> { type Item = u8; + /// Get the next character in the `ProteinTextSlice`. + /// + /// # Returns + /// + /// The next character in the slice. fn next(&mut self) -> Option { if self.index >= self.text_slice.len() { return None; @@ -181,6 +319,11 @@ impl<'a> Iterator for ProteinTextIterator<'a> { type Item = u8; + /// Get the next character in the `ProteinText`. + /// + /// # Returns + /// + /// The next character in the text. fn next(&mut self) -> Option { if self.index >= self.protein_text.len() { return None; @@ -257,7 +400,7 @@ pub fn load_compressed_text( #[cfg(test)] mod tests { - use std::{char, io::Read}; + use std::io::Read; use super::*; From 7e17bd66c39005075f1e8bf1ef2fc48afeb26256 Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Thu, 12 Sep 2024 15:54:47 +0200 Subject: [PATCH 20/27] remove trailing space --- sa-index/src/sa_searcher.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index 4f4522e..d2250ed 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -343,7 +343,7 @@ impl Searcher { && ((skip == 0 || ProteinTextSlice::new(&self.proteins.text, suffix - skip, suffix) .equals_slice(current_search_string_prefix, equate_il)) // Check the prefix - && + && Self::check_suffix( skip, il_locations_current_suffix, From 4be9148a5e60d1bf44e93d8c864caaa20873ade3 Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Thu, 12 Sep 2024 15:56:55 +0200 Subject: [PATCH 21/27] cargo fmt to format code --- bitarray/src/binary.rs | 11 +- bitarray/src/lib.rs | 78 ++++++++------ fa-compression/benches/algorithm1/decode.rs | 2 +- fa-compression/benches/algorithm1/encode.rs | 2 +- fa-compression/benches/algorithm2/decode.rs | 2 +- fa-compression/benches/algorithm2/encode.rs | 2 +- fa-compression/benches/util.rs | 2 +- fa-compression/src/algorithm1/encode.rs | 21 ++-- fa-compression/src/algorithm1/mod.rs | 8 +- fa-compression/src/algorithm2/encode.rs | 7 +- fa-compression/src/algorithm2/mod.rs | 4 +- libsais64-rs/builder.rs | 8 +- libsais64-rs/src/lib.rs | 6 +- sa-builder/src/lib.rs | 10 +- sa-builder/src/main.rs | 4 +- sa-compression/src/lib.rs | 27 ++--- sa-index/src/binary.rs | 36 ++++--- sa-index/src/lib.rs | 10 +- sa-index/src/peptide_search.rs | 18 ++-- sa-index/src/sa_searcher.rs | 56 +++++----- sa-index/src/suffix_to_protein_index.rs | 12 +-- sa-mappings/src/proteins.rs | 18 ++-- sa-server/src/main.rs | 12 +-- text-compression/src/lib.rs | 111 +++++++++----------- 24 files changed, 243 insertions(+), 224 deletions(-) diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs index a8084d1..4ab535f 100644 --- a/bitarray/src/binary.rs +++ b/bitarray/src/binary.rs @@ -167,10 +167,13 @@ mod tests { let mut buffer = Vec::new(); bitarray.write_binary(&mut buffer).unwrap(); - assert_eq!(buffer, vec![ - 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 - ]); + assert_eq!( + buffer, + vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ] + ); } #[test] diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index fe7b532..d58a60c 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -4,7 +4,7 @@ mod binary; use std::{ cmp::max, - io::{Result, Write} + io::{Result, Write}, }; /// Re-export the `Binary` trait. @@ -147,7 +147,6 @@ impl BitArray { pub fn get_data_slice(&self, start_slice: usize, end_slice: usize) -> &[u64] { &self.data[start_slice..end_slice] } - } /// Writes the data to a writer in a binary format using a bit array. This function is helpfull @@ -168,7 +167,7 @@ pub fn data_to_writer( data: Vec, bits_per_value: usize, max_capacity: usize, - writer: &mut impl Write + writer: &mut impl Write, ) -> Result<()> { // Update the max capacity to be a multiple of the greatest common divisor of the bits per value // and 64. This is done to ensure that the bit array can store the data entirely @@ -312,10 +311,13 @@ mod tests { data_to_writer(data, 40, 2, &mut writer).unwrap(); - assert_eq!(writer, vec![ - 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 - ]); + assert_eq!( + writer, + vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ] + ); } #[test] @@ -334,23 +336,27 @@ mod tests { data_to_writer(data, 32, 8, &mut writer).unwrap(); - assert_eq!(writer, vec![ - 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, - 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, - 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, - 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, - 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, - 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, - 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, - 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, - 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, - 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, - 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, - 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, - 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, - 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff - ]); + assert_eq!( + writer, + vec![ + 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, + 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, + 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, + 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, + 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, + 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, + 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, + 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, + 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, + 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, + 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, + 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, + 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, + 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, + 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, + 0xff + ] + ); } #[test] @@ -365,16 +371,20 @@ mod tests { data_to_writer(data, 32, 8, &mut writer).unwrap(); - assert_eq!(writer, vec![ - 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, - 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, - 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, - 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, - 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, - 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, - 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, - 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x00, 0x33, 0x33, 0x33, 0x33 - ]); + assert_eq!( + writer, + vec![ + 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, + 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, + 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, + 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, + 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, + 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, + 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, + 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, + 0x00, 0x00, 0x00, 0x00, 0x33, 0x33, 0x33, 0x33 + ] + ); } #[test] diff --git a/fa-compression/benches/algorithm1/decode.rs b/fa-compression/benches/algorithm1/decode.rs index 24991dc..bd1c94d 100644 --- a/fa-compression/benches/algorithm1/decode.rs +++ b/fa-compression/benches/algorithm1/decode.rs @@ -22,7 +22,7 @@ pub fn decode_benchmark(c: &mut criterion::Criterion) { b.iter_batched( || generate_encoded_annotations(100), |annotations| black_box(decode(annotations.as_slice())), - criterion::BatchSize::SmallInput + criterion::BatchSize::SmallInput, ) }); } diff --git a/fa-compression/benches/algorithm1/encode.rs b/fa-compression/benches/algorithm1/encode.rs index d3a9c86..1d23a6e 100644 --- a/fa-compression/benches/algorithm1/encode.rs +++ b/fa-compression/benches/algorithm1/encode.rs @@ -22,7 +22,7 @@ pub fn encode_benchmark(c: &mut criterion::Criterion) { b.iter_batched( || generate_decoded_annotations(100), |annotations| black_box(encode(annotations.as_str())), - criterion::BatchSize::SmallInput + criterion::BatchSize::SmallInput, ) }); } diff --git a/fa-compression/benches/algorithm2/decode.rs b/fa-compression/benches/algorithm2/decode.rs index 4d562fc..62f8b9e 100644 --- a/fa-compression/benches/algorithm2/decode.rs +++ b/fa-compression/benches/algorithm2/decode.rs @@ -28,7 +28,7 @@ pub fn decode_benchmark(c: &mut criterion::Criterion) { b.iter_batched( || generate_encoded_annotations_and_table(100), |(annotations, ct)| black_box(decode(annotations.as_slice(), ct)), - criterion::BatchSize::SmallInput + criterion::BatchSize::SmallInput, ) }); } diff --git a/fa-compression/benches/algorithm2/encode.rs b/fa-compression/benches/algorithm2/encode.rs index 827dd50..a69ed0e 100644 --- a/fa-compression/benches/algorithm2/encode.rs +++ b/fa-compression/benches/algorithm2/encode.rs @@ -26,7 +26,7 @@ pub fn encode_benchmark(c: &mut criterion::Criterion) { b.iter_batched( || generate_decoded_annotations_and_table(100), |(annotations, ct)| black_box(encode(annotations.as_str(), ct)), - criterion::BatchSize::SmallInput + criterion::BatchSize::SmallInput, ) }); } diff --git a/fa-compression/benches/util.rs b/fa-compression/benches/util.rs index b6ddd9a..0e80765 100644 --- a/fa-compression/benches/util.rs +++ b/fa-compression/benches/util.rs @@ -27,6 +27,6 @@ pub fn generate_annotation(random: &mut ThreadRng) -> String { 0 => generate_ipr(random), 1 => generate_go(random), 2 => generate_ec(random), - _ => unreachable!() + _ => unreachable!(), } } diff --git a/fa-compression/src/algorithm1/encode.rs b/fa-compression/src/algorithm1/encode.rs index ef79372..9138be1 100644 --- a/fa-compression/src/algorithm1/encode.rs +++ b/fa-compression/src/algorithm1/encode.rs @@ -106,16 +106,18 @@ mod tests { #[test] fn test_encode_no_ec() { - assert_eq!(encode("IPR:IPR016364;GO:0009279;IPR:IPR008816"), vec![ - 225, 17, 163, 138, 225, 39, 71, 95, 17, 153, 39 - ]) + assert_eq!( + encode("IPR:IPR016364;GO:0009279;IPR:IPR008816"), + vec![225, 17, 163, 138, 225, 39, 71, 95, 17, 153, 39] + ) } #[test] fn test_encode_no_go() { - assert_eq!(encode("IPR:IPR016364;EC:1.1.1.-;EC:1.2.1.7"), vec![ - 44, 44, 44, 191, 44, 60, 44, 142, 225, 39, 71, 80 - ]) + assert_eq!( + encode("IPR:IPR016364;EC:1.1.1.-;EC:1.2.1.7"), + vec![44, 44, 44, 191, 44, 60, 44, 142, 225, 39, 71, 80] + ) } #[test] @@ -125,8 +127,9 @@ mod tests { #[test] fn test_encode_all() { - assert_eq!(encode("IPR:IPR016364;EC:1.1.1.-;IPR:IPR032635;GO:0009279;IPR:IPR008816"), vec![ - 44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117, 241, 67, 116, 111, 17, 153, 39 - ]) + assert_eq!( + encode("IPR:IPR016364;EC:1.1.1.-;IPR:IPR032635;GO:0009279;IPR:IPR008816"), + vec![44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117, 241, 67, 116, 111, 17, 153, 39] + ) } } diff --git a/fa-compression/src/algorithm1/mod.rs b/fa-compression/src/algorithm1/mod.rs index cdf7283..8ea45c5 100644 --- a/fa-compression/src/algorithm1/mod.rs +++ b/fa-compression/src/algorithm1/mod.rs @@ -79,7 +79,7 @@ enum CharacterSet { Comma, /// Annotation separator - Semicolon + Semicolon, } impl Encode for CharacterSet { @@ -110,7 +110,7 @@ impl Encode for CharacterSet { b'n' => CharacterSet::Preliminary, b',' => CharacterSet::Comma, b';' => CharacterSet::Semicolon, - _ => panic!("Invalid character") + _ => panic!("Invalid character"), } } } @@ -143,7 +143,7 @@ impl Decode for CharacterSet { 13 => 'n', 14 => ',', 15 => ';', - _ => panic!("Invalid character") + _ => panic!("Invalid character"), } } } @@ -189,7 +189,7 @@ mod tests { CharacterSet::Point, CharacterSet::Preliminary, CharacterSet::Comma, - CharacterSet::Semicolon + CharacterSet::Semicolon, ]; #[test] diff --git a/fa-compression/src/algorithm2/encode.rs b/fa-compression/src/algorithm2/encode.rs index f55eb11..d60fe61 100644 --- a/fa-compression/src/algorithm2/encode.rs +++ b/fa-compression/src/algorithm2/encode.rs @@ -89,8 +89,9 @@ mod tests { #[test] fn test_encode_all() { let table = create_compresion_table(); - assert_eq!(encode("IPR:IPR000001;EC:1.1.1.-;IPR:IPR000003;GO:0000002", table), vec![ - 0, 0, 0, 7, 0, 0, 2, 0, 0, 5, 0, 0 - ]) + assert_eq!( + encode("IPR:IPR000001;EC:1.1.1.-;IPR:IPR000003;GO:0000002", table), + vec![0, 0, 0, 7, 0, 0, 2, 0, 0, 5, 0, 0] + ) } } diff --git a/fa-compression/src/algorithm2/mod.rs b/fa-compression/src/algorithm2/mod.rs index 8fc505a..117b87c 100644 --- a/fa-compression/src/algorithm2/mod.rs +++ b/fa-compression/src/algorithm2/mod.rs @@ -12,13 +12,13 @@ pub use encode::encode; /// Represents an entry in the compression table. #[doc(hidden)] pub struct CompressionTableEntry { - annotation: String + annotation: String, } /// Represents a compression table. pub struct CompressionTable { /// List of annotations in the compression table. - entries: Vec + entries: Vec, } impl CompressionTable { diff --git a/libsais64-rs/builder.rs b/libsais64-rs/builder.rs index 5b3feb2..c6fc2d6 100644 --- a/libsais64-rs/builder.rs +++ b/libsais64-rs/builder.rs @@ -3,14 +3,14 @@ use std::{ error::Error, fmt::{Display, Formatter}, path::{Path, PathBuf}, - process::{Command, ExitStatus} + process::{Command, ExitStatus}, }; /// Custom error for compilation of the C library #[derive(Debug)] struct CompileError<'a> { command: &'a str, - exit_code: Option + exit_code: Option, } impl<'a> Display for CompileError<'a> { @@ -43,7 +43,7 @@ impl<'a> Error for CompileError<'a> {} fn exit_status_to_result(name: &str, exit_status: ExitStatus) -> Result<(), CompileError> { match exit_status.success() { true => Ok(()), - false => Err(CompileError { command: name, exit_code: exit_status.code() }) + false => Err(CompileError { command: name, exit_code: exit_status.code() }), } } @@ -61,7 +61,7 @@ fn main() -> Result<(), Box> { Command::new("rm").args(["libsais/CMakeCache.txt"]).status().unwrap_or_default(); // if removing fails, it is since the cmake cache did not exist, we just can ignore it exit_status_to_result( "cmake", - Command::new("cmake").args(["-DCMAKE_BUILD_TYPE=\"Release\"", "libsais", "-Blibsais"]).status()? + Command::new("cmake").args(["-DCMAKE_BUILD_TYPE=\"Release\"", "libsais", "-Blibsais"]).status()?, )?; exit_status_to_result("make", Command::new("make").args(["-C", "libsais"]).status()?)?; diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index e2a87f6..b2a1d3a 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -16,7 +16,11 @@ include!(concat!(env!("OUT_DIR"), "/bindings.rs")); pub fn sais64(text: &[u8]) -> Option> { let mut sa = vec![0; text.len()]; let exit_code = unsafe { libsais64(text.as_ptr(), sa.as_mut_ptr(), text.len() as i64, 0, std::ptr::null_mut()) }; - if exit_code == 0 { Some(sa) } else { None } + if exit_code == 0 { + Some(sa) + } else { + None + } } #[cfg(test)] diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index c0e13cd..f20ec27 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -21,14 +21,14 @@ pub struct Arguments { pub construction_algorithm: SAConstructionAlgorithm, /// If the suffix array should be compressed (default value true) #[arg(short, long, default_value_t = false)] - pub compress_sa: bool + pub compress_sa: bool, } /// Enum representing the two possible algorithms to construct the suffix array #[derive(ValueEnum, Clone, Debug, PartialEq)] pub enum SAConstructionAlgorithm { LibDivSufSort, - LibSais + LibSais, } /// Build a sparse suffix array from the given text @@ -48,7 +48,7 @@ pub enum SAConstructionAlgorithm { pub fn build_ssa( text: &mut Vec, construction_algorithm: &SAConstructionAlgorithm, - sparseness_factor: u8 + sparseness_factor: u8, ) -> Result, Box> { // translate all L's to a I translate_l_to_i(text); @@ -56,7 +56,7 @@ pub fn build_ssa( // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(text), - SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text) + SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text), } .ok_or("Building suffix array failed")?; @@ -125,7 +125,7 @@ mod tests { "2", "--construction-algorithm", "lib-div-suf-sort", - "--compress-sa" + "--compress-sa", ]); assert_eq!(args.database_file, "database.fa"); diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index 01cc3c4..20f2e8a 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -1,7 +1,7 @@ use std::{ fs::{File, OpenOptions}, io::BufWriter, - time::{SystemTime, SystemTimeError, UNIX_EPOCH} + time::{SystemTime, SystemTimeError, UNIX_EPOCH}, }; use clap::Parser; @@ -16,7 +16,7 @@ fn main() { output, sparseness_factor, construction_algorithm, - compress_sa + compress_sa, } = Arguments::parse(); eprintln!(); eprintln!("📋 Started loading the proteins..."); diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs index e9952a2..9814e20 100644 --- a/sa-compression/src/lib.rs +++ b/sa-compression/src/lib.rs @@ -1,6 +1,6 @@ use std::{ error::Error, - io::{BufRead, Write} + io::{BufRead, Write}, }; use bitarray::{data_to_writer, Binary, BitArray}; @@ -22,7 +22,7 @@ pub fn dump_compressed_suffix_array( sa: Vec, sparseness_factor: u8, bits_per_value: usize, - writer: &mut impl Write + writer: &mut impl Write, ) -> Result<(), Box> { // Write the flags to the writer // 00000001 indicates that the suffix array is compressed @@ -59,7 +59,7 @@ pub fn dump_compressed_suffix_array( /// Returns an error if reading from the reader fails. pub fn load_compressed_suffix_array( reader: &mut impl BufRead, - bits_per_value: usize + bits_per_value: usize, ) -> Result> { // Read the sample rate from the binary file (1 byte) let mut sample_rate_buffer = [0_u8; 1]; @@ -92,7 +92,7 @@ mod tests { pub struct FailingWriter { /// The number of times the write function can be called before it fails. - pub valid_write_count: usize + pub valid_write_count: usize, } impl Write for FailingWriter { @@ -112,7 +112,7 @@ mod tests { pub struct FailingReader { /// The number of times the read function can be called before it fails. - pub valid_read_count: usize + pub valid_read_count: usize, } impl Read for FailingReader { @@ -141,13 +141,16 @@ mod tests { let mut writer = vec![]; dump_compressed_suffix_array(sa, 1, 8, &mut writer).unwrap(); - assert_eq!(writer, vec![ - // bits per value - 8, // sparseness factor - 1, // size of the suffix array - 10, 0, 0, 0, 0, 0, 0, 0, // compressed suffix array - 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 10, 9 - ]); + assert_eq!( + writer, + vec![ + // bits per value + 8, // sparseness factor + 1, // size of the suffix array + 10, 0, 0, 0, 0, 0, 0, 0, // compressed suffix array + 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 10, 9 + ] + ); } #[test] diff --git a/sa-index/src/binary.rs b/sa-index/src/binary.rs index 55c082a..fc41f24 100644 --- a/sa-index/src/binary.rs +++ b/sa-index/src/binary.rs @@ -1,6 +1,6 @@ use std::{ error::Error, - io::{BufRead, Read, Write} + io::{BufRead, Read, Write}, }; use crate::SuffixArray; @@ -190,7 +190,7 @@ mod tests { pub struct FailingWriter { /// The number of times the write function can be called before it fails. - pub valid_write_count: usize + pub valid_write_count: usize, } impl Write for FailingWriter { @@ -210,7 +210,7 @@ mod tests { pub struct FailingReader { /// The number of times the read function can be called before it fails. - pub valid_read_count: usize + pub valid_read_count: usize, } impl Read for FailingReader { @@ -266,10 +266,13 @@ mod tests { values.write_binary(&mut buffer).unwrap(); - assert_eq!(buffer, vec![ - 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, - 0, 0, 0, 0 - ]); + assert_eq!( + buffer, + vec![ + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, + 0, 0, 0, 0, 0, 0 + ] + ); } #[test] @@ -292,14 +295,17 @@ mod tests { dump_suffix_array(&sa, 1, &mut buffer).unwrap(); - assert_eq!(buffer, vec![ - // required bits - 64, // Sparseness factor - 1, // Size of the suffix array - 5, 0, 0, 0, 0, 0, 0, 0, // Suffix array - 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, - 0, 0, 0, 0 - ]); + assert_eq!( + buffer, + vec![ + // required bits + 64, // Sparseness factor + 1, // Size of the suffix array + 5, 0, 0, 0, 0, 0, 0, 0, // Suffix array + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, + 0, 0, 0, 0, 0, 0 + ] + ); } #[test] diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs index 53f5348..a43168d 100644 --- a/sa-index/src/lib.rs +++ b/sa-index/src/lib.rs @@ -10,7 +10,7 @@ pub enum SuffixArray { /// The original suffix array. Original(Vec, u8), /// The compressed suffix array. - Compressed(BitArray, u8) + Compressed(BitArray, u8), } impl SuffixArray { @@ -22,7 +22,7 @@ impl SuffixArray { pub fn len(&self) -> usize { match self { SuffixArray::Original(sa, _) => sa.len(), - SuffixArray::Compressed(sa, _) => sa.len() + SuffixArray::Compressed(sa, _) => sa.len(), } } @@ -34,7 +34,7 @@ impl SuffixArray { pub fn bits_per_value(&self) -> usize { match self { SuffixArray::Original(_, _) => 64, - SuffixArray::Compressed(sa, _) => sa.bits_per_value() + SuffixArray::Compressed(sa, _) => sa.bits_per_value(), } } @@ -46,7 +46,7 @@ impl SuffixArray { pub fn sample_rate(&self) -> u8 { match self { SuffixArray::Original(_, sample_rate) => *sample_rate, - SuffixArray::Compressed(_, sample_rate) => *sample_rate + SuffixArray::Compressed(_, sample_rate) => *sample_rate, } } @@ -62,7 +62,7 @@ impl SuffixArray { pub fn get(&self, index: usize) -> i64 { match self { SuffixArray::Original(sa, _) => sa[index], - SuffixArray::Compressed(sa, _) => sa.get(index) as i64 + SuffixArray::Compressed(sa, _) => sa.get(index) as i64, } } diff --git a/sa-index/src/peptide_search.rs b/sa-index/src/peptide_search.rs index 55d629f..02e4975 100644 --- a/sa-index/src/peptide_search.rs +++ b/sa-index/src/peptide_search.rs @@ -8,7 +8,7 @@ use crate::sa_searcher::{SearchAllSuffixesResult, Searcher}; pub struct SearchResult { pub sequence: String, pub proteins: Vec, - pub cutoff_used: bool + pub cutoff_used: bool, } /// Struct that represents all information known about a certain protein in our database @@ -16,7 +16,7 @@ pub struct SearchResult { pub struct ProteinInfo { pub taxon: u32, pub uniprot_accession: String, - pub functional_annotations: String + pub functional_annotations: String, } impl From<&Protein> for ProteinInfo { @@ -24,7 +24,7 @@ impl From<&Protein> for ProteinInfo { ProteinInfo { taxon: protein.taxon_id, uniprot_accession: protein.uniprot_id.clone(), - functional_annotations: protein.get_functional_annotations() + functional_annotations: protein.get_functional_annotations(), } } } @@ -50,7 +50,7 @@ pub fn search_proteins_for_peptide<'a>( searcher: &'a Searcher, peptide: &str, cutoff: usize, - equate_il: bool + equate_il: bool, ) -> Option<(bool, Vec<&'a Protein>)> { let peptide = peptide.trim_end().to_uppercase(); @@ -63,7 +63,7 @@ pub fn search_proteins_for_peptide<'a>( let (suffixes, cutoff_used) = match suffix_search { SearchAllSuffixesResult::MaxMatches(matched_suffixes) => Some((matched_suffixes, true)), SearchAllSuffixesResult::SearchResult(matched_suffixes) => Some((matched_suffixes, false)), - SearchAllSuffixesResult::NoMatches => None + SearchAllSuffixesResult::NoMatches => None, }?; let proteins = searcher.retrieve_proteins(&suffixes); @@ -77,7 +77,7 @@ pub fn search_peptide(searcher: &Searcher, peptide: &str, cutoff: usize, equate_ Some(SearchResult { sequence: peptide.to_string(), proteins: proteins.iter().map(|&protein| protein.into()).collect(), - cutoff_used + cutoff_used, }) } @@ -99,7 +99,7 @@ pub fn search_all_peptides( searcher: &Searcher, peptides: &Vec, cutoff: usize, - equate_il: bool + equate_il: bool, ) -> Vec { peptides .par_iter() @@ -123,7 +123,7 @@ mod tests { let protein_info = ProteinInfo { taxon: 1, uniprot_accession: "P12345".to_string(), - functional_annotations: "GO:0001234;GO:0005678".to_string() + functional_annotations: "GO:0001234;GO:0005678".to_string(), }; let generated_json = serde_json::to_string(&protein_info).unwrap(); @@ -138,7 +138,7 @@ mod tests { let search_result = SearchResult { sequence: "MSKIAALLPSV".to_string(), proteins: vec![], - cutoff_used: true + cutoff_used: true, }; let generated_json = serde_json::to_string(&search_result).unwrap(); diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index d2250ed..119af6c 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -6,21 +6,21 @@ use text_compression::ProteinTextSlice; use crate::{ sa_searcher::BoundSearch::{Maximum, Minimum}, suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex}, - Nullable, SuffixArray + Nullable, SuffixArray, }; /// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array #[derive(Clone, Copy, PartialEq)] enum BoundSearch { Minimum, - Maximum + Maximum, } /// Enum representing the minimum and maximum bound of the found matches in the suffix array #[derive(PartialEq, Debug)] pub enum BoundSearchResult { NoMatches, - SearchResult((usize, usize)) + SearchResult((usize, usize)), } /// Enum representing the matching suffixes after searching a peptide in the suffix array @@ -30,7 +30,7 @@ pub enum BoundSearchResult { pub enum SearchAllSuffixesResult { NoMatches, MaxMatches(Vec), - SearchResult(Vec) + SearchResult(Vec), } /// Custom implementation of partialEq for SearchAllSuffixesResult @@ -67,7 +67,7 @@ impl PartialEq for SearchAllSuffixesResult { array_eq_unordered(arr1, arr2) } (SearchAllSuffixesResult::NoMatches, SearchAllSuffixesResult::NoMatches) => true, - _ => false + _ => false, } } } @@ -123,7 +123,7 @@ impl Deref for DenseSearcher { pub struct Searcher { pub sa: SuffixArray, pub proteins: Proteins, - pub suffix_index_to_protein: Box + pub suffix_index_to_protein: Box, } impl Searcher { @@ -172,7 +172,7 @@ impl Searcher { // Depending on if we are searching for the min of max bound our condition is different let condition_check = match bound { Minimum => |a: u8, b: u8| a < b, - Maximum => |a: u8, b: u8| a > b + Maximum => |a: u8, b: u8| a > b, }; // match as long as possible @@ -265,7 +265,7 @@ impl Searcher { match bound { Minimum => (found, right), - Maximum => (found, left) + Maximum => (found, left), } } @@ -307,7 +307,7 @@ impl Searcher { &self, search_string: &[u8], max_matches: usize, - equate_il: bool + equate_il: bool, ) -> SearchAllSuffixesResult { let mut matching_suffixes: Vec = vec![]; let mut il_locations = vec![]; @@ -394,7 +394,7 @@ impl Searcher { il_locations: &[usize], search_string: &[u8], text_slice: ProteinTextSlice, - equate_il: bool + equate_il: bool, ) -> bool { if equate_il { true @@ -432,7 +432,7 @@ mod tests { use crate::{ sa_searcher::{BoundSearchResult, SearchAllSuffixesResult, Searcher}, suffix_to_protein_index::SparseSuffixToProtein, - SuffixArray + SuffixArray, }; #[test] @@ -465,24 +465,24 @@ mod tests { Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![] + functional_annotations: vec![], }, Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![] + functional_annotations: vec![], }, Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![] + functional_annotations: vec![], }, Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![] + functional_annotations: vec![], }, - ] + ], } } @@ -568,8 +568,8 @@ mod tests { proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![] - }] + functional_annotations: vec![], + }], }; let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2); @@ -591,8 +591,8 @@ mod tests { proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![] - }] + functional_annotations: vec![], + }], }; let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1); @@ -613,10 +613,10 @@ mod tests { proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![] - }] + functional_annotations: vec![], + }], }; - + let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); let suffix_index_to_protein = SparseSuffixToProtein::new(&proteins.text); let searcher = Searcher::new(sparse_sa, proteins, Box::new(suffix_index_to_protein)); @@ -629,14 +629,14 @@ mod tests { fn test_il_suffix_check() { let input_string = "IIIILL$"; let text = ProteinText::from_string(input_string); - + let proteins = Proteins { text, proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![] - }] + functional_annotations: vec![], + }], }; let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2); @@ -659,8 +659,8 @@ mod tests { proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![] - }] + functional_annotations: vec![], + }], }; let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs index 6aed362..1a224d2 100644 --- a/sa-index/src/suffix_to_protein_index.rs +++ b/sa-index/src/suffix_to_protein_index.rs @@ -8,7 +8,7 @@ use text_compression::ProteinText; #[derive(ValueEnum, Clone, Debug, PartialEq)] pub enum SuffixToProteinMappingStyle { Dense, - Sparse + Sparse, } /// Trait implemented by the SuffixToProtein mappings @@ -29,14 +29,14 @@ pub trait SuffixToProteinIndex: Send + Sync { #[derive(Debug, PartialEq)] pub struct DenseSuffixToProtein { // UniProtKB does not have more that u32::MAX proteins, so a larger type is not needed - mapping: Vec + mapping: Vec, } /// Mapping that uses O(m) memory with m the number of proteins, but retrieval of the protein is /// O(log m) #[derive(Debug, PartialEq)] pub struct SparseSuffixToProtein { - mapping: Vec + mapping: Vec, } impl SuffixToProteinIndex for DenseSuffixToProtein { @@ -113,9 +113,9 @@ mod tests { use crate::{ suffix_to_protein_index::{ - DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex, SuffixToProteinMappingStyle + DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex, SuffixToProteinMappingStyle, }, - Nullable + Nullable, }; fn build_text() -> ProteinText { @@ -138,7 +138,7 @@ mod tests { let u8_text = &build_text(); let index = DenseSuffixToProtein::new(u8_text); let expected = DenseSuffixToProtein { - mapping: vec![0, 0, 0, u32::NULL, 1, 1, u32::NULL, 2, 2, 2, u32::NULL] + mapping: vec![0, 0, 0, u32::NULL, 1, 1, u32::NULL, 2, 2, 2, u32::NULL], }; assert_eq!(index, expected); } diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs index 626ead3..9285980 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -23,7 +23,7 @@ pub struct Protein { pub taxon_id: u32, /// The encoded functional annotations of the protein - pub functional_annotations: Vec + pub functional_annotations: Vec, } /// A struct that represents a collection of proteins @@ -32,7 +32,7 @@ pub struct Proteins { pub text: ProteinText, /// The proteins in the input string - pub proteins: Vec + pub proteins: Vec, } impl Protein { @@ -80,7 +80,7 @@ impl Proteins { proteins.push(Protein { uniprot_id: uniprot_id.to_string(), taxon_id, - functional_annotations + functional_annotations, }); } @@ -126,7 +126,6 @@ impl Proteins { let text = ProteinText::from_string(&input_string); Ok(text) - } /// Creates a `vec` which represents all the proteins concatenated from the database file @@ -165,7 +164,6 @@ impl Proteins { input_string.shrink_to_fit(); Ok(input_string.into_bytes()) - } } @@ -197,7 +195,7 @@ mod tests { .unwrap(); file.write( "P13579\t17\tKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n" - .as_bytes() + .as_bytes(), ) .unwrap(); @@ -209,7 +207,7 @@ mod tests { let protein = Protein { uniprot_id: "P12345".to_string(), taxon_id: 1, - functional_annotations: vec![0xD1, 0x11] + functional_annotations: vec![0xD1, 0x11], }; assert_eq!(protein.uniprot_id, "P12345"); @@ -227,14 +225,14 @@ mod tests { Protein { uniprot_id: "P12345".to_string(), taxon_id: 1, - functional_annotations: vec![0xD1, 0x11] + functional_annotations: vec![0xD1, 0x11], }, Protein { uniprot_id: "P54321".to_string(), taxon_id: 2, - functional_annotations: vec![0xD1, 0x11] + functional_annotations: vec![0xD1, 0x11], }, - ] + ], }; assert_eq!(proteins.proteins.len(), 2); diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs index 5284546..c65ba7c 100644 --- a/sa-server/src/main.rs +++ b/sa-server/src/main.rs @@ -2,14 +2,14 @@ use std::{ error::Error, fs::File, io::{BufReader, Read}, - sync::Arc + sync::Arc, }; use axum::{ extract::{DefaultBodyLimit, State}, http::StatusCode, routing::post, - Json, Router + Json, Router, }; use clap::Parser; use sa_compression::load_compressed_suffix_array; @@ -17,7 +17,7 @@ use sa_index::{ binary::load_suffix_array, peptide_search::{search_all_peptides, SearchResult}, sa_searcher::SparseSearcher, - SuffixArray + SuffixArray, }; use sa_mappings::proteins::Proteins; use serde::Deserialize; @@ -30,7 +30,7 @@ pub struct Arguments { #[arg(short, long)] database_file: String, #[arg(short, long)] - index_file: String + index_file: String, } /// Function used by serde to place a default value in the cutoff field of the input @@ -58,7 +58,7 @@ struct InputData { cutoff: usize, #[serde(default = "bool::default")] // default value is false // TODO: maybe default should be true? - equate_il: bool + equate_il: bool, } #[tokio::main] @@ -81,7 +81,7 @@ async fn main() { /// Returns the search results from the index as a JSON async fn search( State(searcher): State>, - data: Json + data: Json, ) -> Result>, StatusCode> { let search_result = search_all_peptides(&searcher, &data.peptides, data.cutoff, data.equate_il); diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs index 6bfaf1a..dc7f71e 100644 --- a/text-compression/src/lib.rs +++ b/text-compression/src/lib.rs @@ -1,8 +1,8 @@ +use std::collections::HashMap; use std::{ error::Error, - io::{BufRead, Write} + io::{BufRead, Write}, }; -use std::collections::HashMap; use bitarray::{data_to_writer, Binary, BitArray}; @@ -17,7 +17,6 @@ pub struct ProteinText { } impl ProteinText { - /// Creates the hashmap storing the mappings between the characters as `u8` and 5 bit numbers. /// /// # Returns @@ -44,9 +43,9 @@ impl ProteinText { } vec } - + /// Creates the compressed text from a string. - /// + /// /// # Arguments /// * `input_string` - The text (proteins) in string format /// @@ -67,7 +66,7 @@ impl ProteinText { } /// Creates the compressed text from a vector. - /// + /// /// # Arguments /// * `input_vec` - The text (proteins) in a vector with elements of type `u8` representing the amino acids. /// @@ -88,7 +87,7 @@ impl ProteinText { } /// Creates the compressed text from a bit array. - /// + /// /// # Arguments /// * `bit_array` - The text (proteins) in a bit array using 5 bits for each amino acid. /// @@ -102,7 +101,7 @@ impl ProteinText { } /// Creates an instance of `ProteinText` with a given capacity. - /// + /// /// # Arguments /// * `capacity` - The amount of characters in the text. /// @@ -114,7 +113,7 @@ impl ProteinText { } /// Search the character at a given position in the compressed text. - /// + /// /// # Arguments /// * `index` - The index of the character to search. /// @@ -127,7 +126,7 @@ impl ProteinText { } /// Set the character at a given index. - /// + /// /// # Arguments /// * `index` - The index of the character to change. /// * `value` - The character to fill in as `u8`. @@ -139,7 +138,7 @@ impl ProteinText { /// Queries the length of the text. /// /// # Returns - /// + /// /// the length of the text pub fn len(&self) -> usize { self.bit_array.len() @@ -148,7 +147,7 @@ impl ProteinText { /// Check if the text is empty (length 0). /// /// # Returns - /// + /// /// true if the the text has length 0, false otherwise. pub fn is_empty(&self) -> bool { self.bit_array.len() == 0 @@ -162,21 +161,20 @@ impl ProteinText { /// Get an iterator over the characters of the text. /// /// # Returns - /// + /// /// A `ProteinTextIterator`, which can iterate over the characters of the text. pub fn iter(&self) -> ProteinTextIterator { - ProteinTextIterator {protein_text: self, index: 0, } + ProteinTextIterator { protein_text: self, index: 0 } } /// Get a slice of the text /// /// # Returns - /// + /// /// An `ProteinTextSlice` representing a slice of the text. - pub fn slice(&self, start: usize, end:usize) -> ProteinTextSlice { + pub fn slice(&self, start: usize, end: usize) -> ProteinTextSlice { ProteinTextSlice::new(self, start, end) } - } /// Structure representing a slice of a `ProteinText`. @@ -186,13 +184,12 @@ pub struct ProteinTextSlice<'a> { /// The start of the slice. start: usize, // included /// The end of the slice. - end: usize, // excluded + end: usize, // excluded } impl<'a> ProteinTextSlice<'a> { - /// Creates an instance of `ProteintextSlice`, given the text and boundaries. - /// + /// /// # Arguments /// * `text` - The `Proteintext` representing the text of proteins with 5 bits per amino acid. /// * `start` - The start of the slice. @@ -202,11 +199,11 @@ impl<'a> ProteinTextSlice<'a> { /// /// An instance of `ProteinTextSlice` pub fn new(text: &'a ProteinText, start: usize, end: usize) -> ProteinTextSlice { - Self {text, start, end } + Self { text, start, end } } /// Get a character (amino acid) in the slice. - /// + /// /// # Arguments /// * `index` - The index in the slice of the character to get. /// @@ -228,7 +225,7 @@ impl<'a> ProteinTextSlice<'a> { /// Checks if the slice and a given array of `u8` are equal. /// I and L can be equated. - /// + /// /// # Arguments /// * `other` - the array of `u8` to compare the slice with. /// * `equate_il` - true if I and L need to be equated, false otherwise. @@ -245,12 +242,15 @@ impl<'a> ProteinTextSlice<'a> { || (search_character == b'L' && text_character == b'I') }) } else { - other.iter().zip(self.iter()).all(|(&search_character, text_character)| search_character == text_character) + other + .iter() + .zip(self.iter()) + .all(|(&search_character, text_character)| search_character == text_character) } } /// Check if the slice and a given array of `u8` are equal on the I and L positions. - /// + /// /// # Arguments /// * `skip` - The amount of positions this slice skipped, this has an influence on the I and L positions. /// * `il_locations` - The positions where I and L occur. @@ -259,12 +259,7 @@ impl<'a> ProteinTextSlice<'a> { /// # Returns /// /// True if the slice and `search_string` have the same contents on the I and L positions, false otherwise. - pub fn check_il_locations( - &self, - skip: usize, - il_locations: &[usize], - search_string: &[u8], - ) -> bool { + pub fn check_il_locations(&self, skip: usize, il_locations: &[usize], search_string: &[u8]) -> bool { for &il_location in il_locations { let index = il_location - skip; if search_string[index] != self.get(index) { @@ -280,7 +275,7 @@ impl<'a> ProteinTextSlice<'a> { /// /// An iterator over the slice. pub fn iter(&self) -> ProteinTextSliceIterator { - ProteinTextSliceIterator {text_slice: self, index: 0, } + ProteinTextSliceIterator { text_slice: self, index: 0 } } } @@ -297,13 +292,12 @@ pub struct ProteinTextSliceIterator<'a> { } impl<'a> Iterator for ProteinTextSliceIterator<'a> { - type Item = u8; - + /// Get the next character in the `ProteinTextSlice`. - /// + /// /// # Returns - /// + /// /// The next character in the slice. fn next(&mut self) -> Option { if self.index >= self.text_slice.len() { @@ -316,13 +310,12 @@ impl<'a> Iterator for ProteinTextSliceIterator<'a> { } impl<'a> Iterator for ProteinTextIterator<'a> { - type Item = u8; - + /// Get the next character in the `ProteinText`. - /// + /// /// # Returns - /// + /// /// The next character in the text. fn next(&mut self) -> Option { if self.index >= self.protein_text.len() { @@ -344,10 +337,7 @@ impl<'a> Iterator for ProteinTextIterator<'a> { /// # Errors /// /// Returns an error if writing to the writer fails. -pub fn dump_compressed_text( - text: Vec, - writer: &mut impl Write -) -> Result<(), Box> { +pub fn dump_compressed_text(text: Vec, writer: &mut impl Write) -> Result<(), Box> { let bits_per_value = 5; // Write the flags to the writer @@ -378,9 +368,7 @@ pub fn dump_compressed_text( /// # Errors /// /// Returns an error if reading from the reader fails. -pub fn load_compressed_text( - reader: &mut impl BufRead -) -> Result> { +pub fn load_compressed_text(reader: &mut impl BufRead) -> Result> { let bits_per_value: usize = 5; // Read the size of the text from the binary file (8 bytes) let mut size_buffer = [0_u8; 8]; @@ -406,7 +394,7 @@ mod tests { pub struct FailingWriter { /// The number of times the write function can be called before it fails. - pub valid_write_count: usize + pub valid_write_count: usize, } impl Write for FailingWriter { @@ -426,7 +414,7 @@ mod tests { pub struct FailingReader { /// The number of times the read function can be called before it fails. - pub valid_read_count: usize + pub valid_read_count: usize, } impl Read for FailingReader { @@ -514,7 +502,7 @@ mod tests { fn test_text_slice() { let input_string = "ACACA-CAC$"; let start = 1; - let end = 5; + let end = 5; let text = ProteinText::from_string(&input_string); let text_slice = text.slice(start, end); @@ -533,7 +521,7 @@ mod tests { let eq_slice_il_true = [b'C', b'L', b'C', b'A']; assert!(text_slice.equals_slice(&eq_slice_true, false)); - assert!(! text_slice.equals_slice(&eq_slice_false, false)); + assert!(!text_slice.equals_slice(&eq_slice_false, false)); assert!(text_slice.equals_slice(&eq_slice_il_true, true)); } @@ -547,7 +535,7 @@ mod tests { let il_false = [b'C', b'I', b'C', b'A']; assert!(text_slice.check_il_locations(0, &il_locations, &il_true)); - assert!(! text_slice.check_il_locations(0, &il_locations, &il_false)); + assert!(!text_slice.check_il_locations(0, &il_locations, &il_false)); } #[test] @@ -557,12 +545,15 @@ mod tests { let mut writer = vec![]; dump_compressed_text(text, &mut writer).unwrap(); - assert_eq!(writer, vec![ - // bits per value - 5, // size of the text - 10, 0, 0, 0, 0, 0, 0, 0, // compressed text - 0, 128, 74, 232, 152, 66, 134, 8 - ]); + assert_eq!( + writer, + vec![ + // bits per value + 5, // size of the text + 10, 0, 0, 0, 0, 0, 0, 0, // compressed text + 0, 128, 74, 232, 152, 66, 134, 8 + ] + ); } #[test] @@ -592,9 +583,9 @@ mod tests { #[test] fn test_load_compressed_text() { let data = vec![ - // size of the text + // size of the text 10, 0, 0, 0, 0, 0, 0, 0, // compressed text - 0, 128, 74, 232, 152, 66, 134, 8 + 0, 128, 74, 232, 152, 66, 134, 8, ]; let mut reader = std::io::BufReader::new(&data[..]); From d1d8f88fd6e799a216a69acd91e9a7edfec6c54f Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Thu, 12 Sep 2024 16:22:17 +0200 Subject: [PATCH 22/27] cargo fmt to format code --- bitarray/src/binary.rs | 11 +-- bitarray/src/lib.rs | 81 +++++++++------------ fa-compression/benches/algorithm1/decode.rs | 2 +- fa-compression/benches/algorithm1/encode.rs | 2 +- fa-compression/benches/algorithm2/decode.rs | 2 +- fa-compression/benches/algorithm2/encode.rs | 2 +- fa-compression/benches/util.rs | 2 +- fa-compression/src/algorithm1/encode.rs | 21 +++--- fa-compression/src/algorithm1/mod.rs | 8 +- fa-compression/src/algorithm2/encode.rs | 7 +- fa-compression/src/algorithm2/mod.rs | 4 +- libsais64-rs/builder.rs | 8 +- libsais64-rs/src/lib.rs | 6 +- sa-builder/src/lib.rs | 10 +-- sa-builder/src/main.rs | 4 +- sa-compression/src/lib.rs | 27 +++---- sa-index/src/binary.rs | 36 ++++----- sa-index/src/lib.rs | 10 +-- sa-index/src/peptide_search.rs | 18 ++--- sa-index/src/sa_searcher.rs | 58 +++++++-------- sa-index/src/suffix_to_protein_index.rs | 14 ++-- sa-mappings/src/proteins.rs | 16 ++-- sa-server/src/main.rs | 12 +-- text-compression/src/lib.rs | 31 ++++---- 24 files changed, 177 insertions(+), 215 deletions(-) diff --git a/bitarray/src/binary.rs b/bitarray/src/binary.rs index 4ab535f..a8084d1 100644 --- a/bitarray/src/binary.rs +++ b/bitarray/src/binary.rs @@ -167,13 +167,10 @@ mod tests { let mut buffer = Vec::new(); bitarray.write_binary(&mut buffer).unwrap(); - assert_eq!( - buffer, - vec![ - 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, 0x00, - 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 - ] - ); + assert_eq!(buffer, vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ]); } #[test] diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index d58a60c..e4bd8a2 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -4,7 +4,7 @@ mod binary; use std::{ cmp::max, - io::{Result, Write}, + io::{Result, Write} }; /// Re-export the `Binary` trait. @@ -19,7 +19,7 @@ pub struct BitArray { /// The length of the bit array. len: usize, /// The number of bits in a single element of the data vector. - bits_per_value: usize, + bits_per_value: usize } impl BitArray { @@ -39,7 +39,7 @@ impl BitArray { data: vec![0; capacity * bits_per_value / 64 + extra], mask: (1 << bits_per_value) - 1, len: capacity, - bits_per_value, + bits_per_value } } @@ -167,7 +167,7 @@ pub fn data_to_writer( data: Vec, bits_per_value: usize, max_capacity: usize, - writer: &mut impl Write, + writer: &mut impl Write ) -> Result<()> { // Update the max capacity to be a multiple of the greatest common divisor of the bits per value // and 64. This is done to ensure that the bit array can store the data entirely @@ -311,13 +311,10 @@ mod tests { data_to_writer(data, 40, 2, &mut writer).unwrap(); - assert_eq!( - writer, - vec![ - 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, 0x00, - 0x00, 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 - ] - ); + assert_eq!(writer, vec![ + 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xde, 0xbc, 0x0a, 0x89, 0x67, 0x45, 0x23, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x56, 0x34, 0x12, 0xf0 + ]); } #[test] @@ -336,27 +333,23 @@ mod tests { data_to_writer(data, 32, 8, &mut writer).unwrap(); - assert_eq!( - writer, - vec![ - 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, - 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, - 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, - 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, - 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, - 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, - 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, - 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, - 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, - 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, - 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, - 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, - 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, - 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, - 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, - 0xff - ] - ); + assert_eq!(writer, vec![ + 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, + 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, + 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, + 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, + 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, + 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, + 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, + 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, + 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, + 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, + 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, + 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, + 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, + 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff + ]); } #[test] @@ -371,20 +364,16 @@ mod tests { data_to_writer(data, 32, 8, &mut writer).unwrap(); - assert_eq!( - writer, - vec![ - 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, - 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, - 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, - 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, - 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, - 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, - 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, - 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, - 0x00, 0x00, 0x00, 0x00, 0x33, 0x33, 0x33, 0x33 - ] - ); + assert_eq!(writer, vec![ + 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, + 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, + 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, + 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, + 0x44, 0x44, 0x44, 0x44, 0x33, 0x33, 0x33, 0x33, 0x66, 0x66, 0x66, 0x66, 0x55, 0x55, 0x55, 0x55, 0x88, 0x88, + 0x88, 0x88, 0x77, 0x77, 0x77, 0x77, 0xaa, 0xaa, 0xaa, 0xaa, 0x99, 0x99, 0x99, 0x99, 0xcc, 0xcc, 0xcc, 0xcc, + 0xbb, 0xbb, 0xbb, 0xbb, 0xee, 0xee, 0xee, 0xee, 0xdd, 0xdd, 0xdd, 0xdd, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, + 0xff, 0xff, 0x22, 0x22, 0x22, 0x22, 0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x00, 0x33, 0x33, 0x33, 0x33 + ]); } #[test] diff --git a/fa-compression/benches/algorithm1/decode.rs b/fa-compression/benches/algorithm1/decode.rs index bd1c94d..24991dc 100644 --- a/fa-compression/benches/algorithm1/decode.rs +++ b/fa-compression/benches/algorithm1/decode.rs @@ -22,7 +22,7 @@ pub fn decode_benchmark(c: &mut criterion::Criterion) { b.iter_batched( || generate_encoded_annotations(100), |annotations| black_box(decode(annotations.as_slice())), - criterion::BatchSize::SmallInput, + criterion::BatchSize::SmallInput ) }); } diff --git a/fa-compression/benches/algorithm1/encode.rs b/fa-compression/benches/algorithm1/encode.rs index 1d23a6e..d3a9c86 100644 --- a/fa-compression/benches/algorithm1/encode.rs +++ b/fa-compression/benches/algorithm1/encode.rs @@ -22,7 +22,7 @@ pub fn encode_benchmark(c: &mut criterion::Criterion) { b.iter_batched( || generate_decoded_annotations(100), |annotations| black_box(encode(annotations.as_str())), - criterion::BatchSize::SmallInput, + criterion::BatchSize::SmallInput ) }); } diff --git a/fa-compression/benches/algorithm2/decode.rs b/fa-compression/benches/algorithm2/decode.rs index 62f8b9e..4d562fc 100644 --- a/fa-compression/benches/algorithm2/decode.rs +++ b/fa-compression/benches/algorithm2/decode.rs @@ -28,7 +28,7 @@ pub fn decode_benchmark(c: &mut criterion::Criterion) { b.iter_batched( || generate_encoded_annotations_and_table(100), |(annotations, ct)| black_box(decode(annotations.as_slice(), ct)), - criterion::BatchSize::SmallInput, + criterion::BatchSize::SmallInput ) }); } diff --git a/fa-compression/benches/algorithm2/encode.rs b/fa-compression/benches/algorithm2/encode.rs index a69ed0e..827dd50 100644 --- a/fa-compression/benches/algorithm2/encode.rs +++ b/fa-compression/benches/algorithm2/encode.rs @@ -26,7 +26,7 @@ pub fn encode_benchmark(c: &mut criterion::Criterion) { b.iter_batched( || generate_decoded_annotations_and_table(100), |(annotations, ct)| black_box(encode(annotations.as_str(), ct)), - criterion::BatchSize::SmallInput, + criterion::BatchSize::SmallInput ) }); } diff --git a/fa-compression/benches/util.rs b/fa-compression/benches/util.rs index 0e80765..b6ddd9a 100644 --- a/fa-compression/benches/util.rs +++ b/fa-compression/benches/util.rs @@ -27,6 +27,6 @@ pub fn generate_annotation(random: &mut ThreadRng) -> String { 0 => generate_ipr(random), 1 => generate_go(random), 2 => generate_ec(random), - _ => unreachable!(), + _ => unreachable!() } } diff --git a/fa-compression/src/algorithm1/encode.rs b/fa-compression/src/algorithm1/encode.rs index 9138be1..ef79372 100644 --- a/fa-compression/src/algorithm1/encode.rs +++ b/fa-compression/src/algorithm1/encode.rs @@ -106,18 +106,16 @@ mod tests { #[test] fn test_encode_no_ec() { - assert_eq!( - encode("IPR:IPR016364;GO:0009279;IPR:IPR008816"), - vec![225, 17, 163, 138, 225, 39, 71, 95, 17, 153, 39] - ) + assert_eq!(encode("IPR:IPR016364;GO:0009279;IPR:IPR008816"), vec![ + 225, 17, 163, 138, 225, 39, 71, 95, 17, 153, 39 + ]) } #[test] fn test_encode_no_go() { - assert_eq!( - encode("IPR:IPR016364;EC:1.1.1.-;EC:1.2.1.7"), - vec![44, 44, 44, 191, 44, 60, 44, 142, 225, 39, 71, 80] - ) + assert_eq!(encode("IPR:IPR016364;EC:1.1.1.-;EC:1.2.1.7"), vec![ + 44, 44, 44, 191, 44, 60, 44, 142, 225, 39, 71, 80 + ]) } #[test] @@ -127,9 +125,8 @@ mod tests { #[test] fn test_encode_all() { - assert_eq!( - encode("IPR:IPR016364;EC:1.1.1.-;IPR:IPR032635;GO:0009279;IPR:IPR008816"), - vec![44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117, 241, 67, 116, 111, 17, 153, 39] - ) + assert_eq!(encode("IPR:IPR016364;EC:1.1.1.-;IPR:IPR032635;GO:0009279;IPR:IPR008816"), vec![ + 44, 44, 44, 190, 17, 26, 56, 174, 18, 116, 117, 241, 67, 116, 111, 17, 153, 39 + ]) } } diff --git a/fa-compression/src/algorithm1/mod.rs b/fa-compression/src/algorithm1/mod.rs index 8ea45c5..cdf7283 100644 --- a/fa-compression/src/algorithm1/mod.rs +++ b/fa-compression/src/algorithm1/mod.rs @@ -79,7 +79,7 @@ enum CharacterSet { Comma, /// Annotation separator - Semicolon, + Semicolon } impl Encode for CharacterSet { @@ -110,7 +110,7 @@ impl Encode for CharacterSet { b'n' => CharacterSet::Preliminary, b',' => CharacterSet::Comma, b';' => CharacterSet::Semicolon, - _ => panic!("Invalid character"), + _ => panic!("Invalid character") } } } @@ -143,7 +143,7 @@ impl Decode for CharacterSet { 13 => 'n', 14 => ',', 15 => ';', - _ => panic!("Invalid character"), + _ => panic!("Invalid character") } } } @@ -189,7 +189,7 @@ mod tests { CharacterSet::Point, CharacterSet::Preliminary, CharacterSet::Comma, - CharacterSet::Semicolon, + CharacterSet::Semicolon ]; #[test] diff --git a/fa-compression/src/algorithm2/encode.rs b/fa-compression/src/algorithm2/encode.rs index d60fe61..f55eb11 100644 --- a/fa-compression/src/algorithm2/encode.rs +++ b/fa-compression/src/algorithm2/encode.rs @@ -89,9 +89,8 @@ mod tests { #[test] fn test_encode_all() { let table = create_compresion_table(); - assert_eq!( - encode("IPR:IPR000001;EC:1.1.1.-;IPR:IPR000003;GO:0000002", table), - vec![0, 0, 0, 7, 0, 0, 2, 0, 0, 5, 0, 0] - ) + assert_eq!(encode("IPR:IPR000001;EC:1.1.1.-;IPR:IPR000003;GO:0000002", table), vec![ + 0, 0, 0, 7, 0, 0, 2, 0, 0, 5, 0, 0 + ]) } } diff --git a/fa-compression/src/algorithm2/mod.rs b/fa-compression/src/algorithm2/mod.rs index 117b87c..8fc505a 100644 --- a/fa-compression/src/algorithm2/mod.rs +++ b/fa-compression/src/algorithm2/mod.rs @@ -12,13 +12,13 @@ pub use encode::encode; /// Represents an entry in the compression table. #[doc(hidden)] pub struct CompressionTableEntry { - annotation: String, + annotation: String } /// Represents a compression table. pub struct CompressionTable { /// List of annotations in the compression table. - entries: Vec, + entries: Vec } impl CompressionTable { diff --git a/libsais64-rs/builder.rs b/libsais64-rs/builder.rs index c6fc2d6..5b3feb2 100644 --- a/libsais64-rs/builder.rs +++ b/libsais64-rs/builder.rs @@ -3,14 +3,14 @@ use std::{ error::Error, fmt::{Display, Formatter}, path::{Path, PathBuf}, - process::{Command, ExitStatus}, + process::{Command, ExitStatus} }; /// Custom error for compilation of the C library #[derive(Debug)] struct CompileError<'a> { command: &'a str, - exit_code: Option, + exit_code: Option } impl<'a> Display for CompileError<'a> { @@ -43,7 +43,7 @@ impl<'a> Error for CompileError<'a> {} fn exit_status_to_result(name: &str, exit_status: ExitStatus) -> Result<(), CompileError> { match exit_status.success() { true => Ok(()), - false => Err(CompileError { command: name, exit_code: exit_status.code() }), + false => Err(CompileError { command: name, exit_code: exit_status.code() }) } } @@ -61,7 +61,7 @@ fn main() -> Result<(), Box> { Command::new("rm").args(["libsais/CMakeCache.txt"]).status().unwrap_or_default(); // if removing fails, it is since the cmake cache did not exist, we just can ignore it exit_status_to_result( "cmake", - Command::new("cmake").args(["-DCMAKE_BUILD_TYPE=\"Release\"", "libsais", "-Blibsais"]).status()?, + Command::new("cmake").args(["-DCMAKE_BUILD_TYPE=\"Release\"", "libsais", "-Blibsais"]).status()? )?; exit_status_to_result("make", Command::new("make").args(["-C", "libsais"]).status()?)?; diff --git a/libsais64-rs/src/lib.rs b/libsais64-rs/src/lib.rs index b2a1d3a..e2a87f6 100644 --- a/libsais64-rs/src/lib.rs +++ b/libsais64-rs/src/lib.rs @@ -16,11 +16,7 @@ include!(concat!(env!("OUT_DIR"), "/bindings.rs")); pub fn sais64(text: &[u8]) -> Option> { let mut sa = vec![0; text.len()]; let exit_code = unsafe { libsais64(text.as_ptr(), sa.as_mut_ptr(), text.len() as i64, 0, std::ptr::null_mut()) }; - if exit_code == 0 { - Some(sa) - } else { - None - } + if exit_code == 0 { Some(sa) } else { None } } #[cfg(test)] diff --git a/sa-builder/src/lib.rs b/sa-builder/src/lib.rs index f20ec27..c0e13cd 100644 --- a/sa-builder/src/lib.rs +++ b/sa-builder/src/lib.rs @@ -21,14 +21,14 @@ pub struct Arguments { pub construction_algorithm: SAConstructionAlgorithm, /// If the suffix array should be compressed (default value true) #[arg(short, long, default_value_t = false)] - pub compress_sa: bool, + pub compress_sa: bool } /// Enum representing the two possible algorithms to construct the suffix array #[derive(ValueEnum, Clone, Debug, PartialEq)] pub enum SAConstructionAlgorithm { LibDivSufSort, - LibSais, + LibSais } /// Build a sparse suffix array from the given text @@ -48,7 +48,7 @@ pub enum SAConstructionAlgorithm { pub fn build_ssa( text: &mut Vec, construction_algorithm: &SAConstructionAlgorithm, - sparseness_factor: u8, + sparseness_factor: u8 ) -> Result, Box> { // translate all L's to a I translate_l_to_i(text); @@ -56,7 +56,7 @@ pub fn build_ssa( // Build the suffix array using the selected algorithm let mut sa = match construction_algorithm { SAConstructionAlgorithm::LibSais => libsais64_rs::sais64(text), - SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text), + SAConstructionAlgorithm::LibDivSufSort => libdivsufsort_rs::divsufsort64(text) } .ok_or("Building suffix array failed")?; @@ -125,7 +125,7 @@ mod tests { "2", "--construction-algorithm", "lib-div-suf-sort", - "--compress-sa", + "--compress-sa" ]); assert_eq!(args.database_file, "database.fa"); diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index 20f2e8a..01cc3c4 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -1,7 +1,7 @@ use std::{ fs::{File, OpenOptions}, io::BufWriter, - time::{SystemTime, SystemTimeError, UNIX_EPOCH}, + time::{SystemTime, SystemTimeError, UNIX_EPOCH} }; use clap::Parser; @@ -16,7 +16,7 @@ fn main() { output, sparseness_factor, construction_algorithm, - compress_sa, + compress_sa } = Arguments::parse(); eprintln!(); eprintln!("📋 Started loading the proteins..."); diff --git a/sa-compression/src/lib.rs b/sa-compression/src/lib.rs index 9814e20..e9952a2 100644 --- a/sa-compression/src/lib.rs +++ b/sa-compression/src/lib.rs @@ -1,6 +1,6 @@ use std::{ error::Error, - io::{BufRead, Write}, + io::{BufRead, Write} }; use bitarray::{data_to_writer, Binary, BitArray}; @@ -22,7 +22,7 @@ pub fn dump_compressed_suffix_array( sa: Vec, sparseness_factor: u8, bits_per_value: usize, - writer: &mut impl Write, + writer: &mut impl Write ) -> Result<(), Box> { // Write the flags to the writer // 00000001 indicates that the suffix array is compressed @@ -59,7 +59,7 @@ pub fn dump_compressed_suffix_array( /// Returns an error if reading from the reader fails. pub fn load_compressed_suffix_array( reader: &mut impl BufRead, - bits_per_value: usize, + bits_per_value: usize ) -> Result> { // Read the sample rate from the binary file (1 byte) let mut sample_rate_buffer = [0_u8; 1]; @@ -92,7 +92,7 @@ mod tests { pub struct FailingWriter { /// The number of times the write function can be called before it fails. - pub valid_write_count: usize, + pub valid_write_count: usize } impl Write for FailingWriter { @@ -112,7 +112,7 @@ mod tests { pub struct FailingReader { /// The number of times the read function can be called before it fails. - pub valid_read_count: usize, + pub valid_read_count: usize } impl Read for FailingReader { @@ -141,16 +141,13 @@ mod tests { let mut writer = vec![]; dump_compressed_suffix_array(sa, 1, 8, &mut writer).unwrap(); - assert_eq!( - writer, - vec![ - // bits per value - 8, // sparseness factor - 1, // size of the suffix array - 10, 0, 0, 0, 0, 0, 0, 0, // compressed suffix array - 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 10, 9 - ] - ); + assert_eq!(writer, vec![ + // bits per value + 8, // sparseness factor + 1, // size of the suffix array + 10, 0, 0, 0, 0, 0, 0, 0, // compressed suffix array + 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 10, 9 + ]); } #[test] diff --git a/sa-index/src/binary.rs b/sa-index/src/binary.rs index fc41f24..55c082a 100644 --- a/sa-index/src/binary.rs +++ b/sa-index/src/binary.rs @@ -1,6 +1,6 @@ use std::{ error::Error, - io::{BufRead, Read, Write}, + io::{BufRead, Read, Write} }; use crate::SuffixArray; @@ -190,7 +190,7 @@ mod tests { pub struct FailingWriter { /// The number of times the write function can be called before it fails. - pub valid_write_count: usize, + pub valid_write_count: usize } impl Write for FailingWriter { @@ -210,7 +210,7 @@ mod tests { pub struct FailingReader { /// The number of times the read function can be called before it fails. - pub valid_read_count: usize, + pub valid_read_count: usize } impl Read for FailingReader { @@ -266,13 +266,10 @@ mod tests { values.write_binary(&mut buffer).unwrap(); - assert_eq!( - buffer, - vec![ - 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, - 0, 0, 0, 0, 0, 0 - ] - ); + assert_eq!(buffer, vec![ + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, + 0, 0, 0, 0 + ]); } #[test] @@ -295,17 +292,14 @@ mod tests { dump_suffix_array(&sa, 1, &mut buffer).unwrap(); - assert_eq!( - buffer, - vec![ - // required bits - 64, // Sparseness factor - 1, // Size of the suffix array - 5, 0, 0, 0, 0, 0, 0, 0, // Suffix array - 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, - 0, 0, 0, 0, 0, 0 - ] - ); + assert_eq!(buffer, vec![ + // required bits + 64, // Sparseness factor + 1, // Size of the suffix array + 5, 0, 0, 0, 0, 0, 0, 0, // Suffix array + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, + 0, 0, 0, 0 + ]); } #[test] diff --git a/sa-index/src/lib.rs b/sa-index/src/lib.rs index a43168d..53f5348 100644 --- a/sa-index/src/lib.rs +++ b/sa-index/src/lib.rs @@ -10,7 +10,7 @@ pub enum SuffixArray { /// The original suffix array. Original(Vec, u8), /// The compressed suffix array. - Compressed(BitArray, u8), + Compressed(BitArray, u8) } impl SuffixArray { @@ -22,7 +22,7 @@ impl SuffixArray { pub fn len(&self) -> usize { match self { SuffixArray::Original(sa, _) => sa.len(), - SuffixArray::Compressed(sa, _) => sa.len(), + SuffixArray::Compressed(sa, _) => sa.len() } } @@ -34,7 +34,7 @@ impl SuffixArray { pub fn bits_per_value(&self) -> usize { match self { SuffixArray::Original(_, _) => 64, - SuffixArray::Compressed(sa, _) => sa.bits_per_value(), + SuffixArray::Compressed(sa, _) => sa.bits_per_value() } } @@ -46,7 +46,7 @@ impl SuffixArray { pub fn sample_rate(&self) -> u8 { match self { SuffixArray::Original(_, sample_rate) => *sample_rate, - SuffixArray::Compressed(_, sample_rate) => *sample_rate, + SuffixArray::Compressed(_, sample_rate) => *sample_rate } } @@ -62,7 +62,7 @@ impl SuffixArray { pub fn get(&self, index: usize) -> i64 { match self { SuffixArray::Original(sa, _) => sa[index], - SuffixArray::Compressed(sa, _) => sa.get(index) as i64, + SuffixArray::Compressed(sa, _) => sa.get(index) as i64 } } diff --git a/sa-index/src/peptide_search.rs b/sa-index/src/peptide_search.rs index 02e4975..55d629f 100644 --- a/sa-index/src/peptide_search.rs +++ b/sa-index/src/peptide_search.rs @@ -8,7 +8,7 @@ use crate::sa_searcher::{SearchAllSuffixesResult, Searcher}; pub struct SearchResult { pub sequence: String, pub proteins: Vec, - pub cutoff_used: bool, + pub cutoff_used: bool } /// Struct that represents all information known about a certain protein in our database @@ -16,7 +16,7 @@ pub struct SearchResult { pub struct ProteinInfo { pub taxon: u32, pub uniprot_accession: String, - pub functional_annotations: String, + pub functional_annotations: String } impl From<&Protein> for ProteinInfo { @@ -24,7 +24,7 @@ impl From<&Protein> for ProteinInfo { ProteinInfo { taxon: protein.taxon_id, uniprot_accession: protein.uniprot_id.clone(), - functional_annotations: protein.get_functional_annotations(), + functional_annotations: protein.get_functional_annotations() } } } @@ -50,7 +50,7 @@ pub fn search_proteins_for_peptide<'a>( searcher: &'a Searcher, peptide: &str, cutoff: usize, - equate_il: bool, + equate_il: bool ) -> Option<(bool, Vec<&'a Protein>)> { let peptide = peptide.trim_end().to_uppercase(); @@ -63,7 +63,7 @@ pub fn search_proteins_for_peptide<'a>( let (suffixes, cutoff_used) = match suffix_search { SearchAllSuffixesResult::MaxMatches(matched_suffixes) => Some((matched_suffixes, true)), SearchAllSuffixesResult::SearchResult(matched_suffixes) => Some((matched_suffixes, false)), - SearchAllSuffixesResult::NoMatches => None, + SearchAllSuffixesResult::NoMatches => None }?; let proteins = searcher.retrieve_proteins(&suffixes); @@ -77,7 +77,7 @@ pub fn search_peptide(searcher: &Searcher, peptide: &str, cutoff: usize, equate_ Some(SearchResult { sequence: peptide.to_string(), proteins: proteins.iter().map(|&protein| protein.into()).collect(), - cutoff_used, + cutoff_used }) } @@ -99,7 +99,7 @@ pub fn search_all_peptides( searcher: &Searcher, peptides: &Vec, cutoff: usize, - equate_il: bool, + equate_il: bool ) -> Vec { peptides .par_iter() @@ -123,7 +123,7 @@ mod tests { let protein_info = ProteinInfo { taxon: 1, uniprot_accession: "P12345".to_string(), - functional_annotations: "GO:0001234;GO:0005678".to_string(), + functional_annotations: "GO:0001234;GO:0005678".to_string() }; let generated_json = serde_json::to_string(&protein_info).unwrap(); @@ -138,7 +138,7 @@ mod tests { let search_result = SearchResult { sequence: "MSKIAALLPSV".to_string(), proteins: vec![], - cutoff_used: true, + cutoff_used: true }; let generated_json = serde_json::to_string(&search_result).unwrap(); diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index 119af6c..03abf07 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -6,21 +6,21 @@ use text_compression::ProteinTextSlice; use crate::{ sa_searcher::BoundSearch::{Maximum, Minimum}, suffix_to_protein_index::{DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex}, - Nullable, SuffixArray, + Nullable, SuffixArray }; /// Enum indicating if we are searching for the minimum, or maximum bound in the suffix array #[derive(Clone, Copy, PartialEq)] enum BoundSearch { Minimum, - Maximum, + Maximum } /// Enum representing the minimum and maximum bound of the found matches in the suffix array #[derive(PartialEq, Debug)] pub enum BoundSearchResult { NoMatches, - SearchResult((usize, usize)), + SearchResult((usize, usize)) } /// Enum representing the matching suffixes after searching a peptide in the suffix array @@ -30,7 +30,7 @@ pub enum BoundSearchResult { pub enum SearchAllSuffixesResult { NoMatches, MaxMatches(Vec), - SearchResult(Vec), + SearchResult(Vec) } /// Custom implementation of partialEq for SearchAllSuffixesResult @@ -67,7 +67,7 @@ impl PartialEq for SearchAllSuffixesResult { array_eq_unordered(arr1, arr2) } (SearchAllSuffixesResult::NoMatches, SearchAllSuffixesResult::NoMatches) => true, - _ => false, + _ => false } } } @@ -123,7 +123,7 @@ impl Deref for DenseSearcher { pub struct Searcher { pub sa: SuffixArray, pub proteins: Proteins, - pub suffix_index_to_protein: Box, + pub suffix_index_to_protein: Box } impl Searcher { @@ -172,7 +172,7 @@ impl Searcher { // Depending on if we are searching for the min of max bound our condition is different let condition_check = match bound { Minimum => |a: u8, b: u8| a < b, - Maximum => |a: u8, b: u8| a > b, + Maximum => |a: u8, b: u8| a > b }; // match as long as possible @@ -265,7 +265,7 @@ impl Searcher { match bound { Minimum => (found, right), - Maximum => (found, left), + Maximum => (found, left) } } @@ -307,7 +307,7 @@ impl Searcher { &self, search_string: &[u8], max_matches: usize, - equate_il: bool, + equate_il: bool ) -> SearchAllSuffixesResult { let mut matching_suffixes: Vec = vec![]; let mut il_locations = vec![]; @@ -394,13 +394,9 @@ impl Searcher { il_locations: &[usize], search_string: &[u8], text_slice: ProteinTextSlice, - equate_il: bool, + equate_il: bool ) -> bool { - if equate_il { - true - } else { - text_slice.check_il_locations(skip, il_locations, search_string) - } + if equate_il { true } else { text_slice.check_il_locations(skip, il_locations, search_string) } } /// Returns all the proteins that correspond with the provided suffixes @@ -432,7 +428,7 @@ mod tests { use crate::{ sa_searcher::{BoundSearchResult, SearchAllSuffixesResult, Searcher}, suffix_to_protein_index::SparseSuffixToProtein, - SuffixArray, + SuffixArray }; #[test] @@ -465,24 +461,24 @@ mod tests { Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![], + functional_annotations: vec![] }, Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![], + functional_annotations: vec![] }, Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![], + functional_annotations: vec![] }, Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![], + functional_annotations: vec![] }, - ], + ] } } @@ -568,8 +564,8 @@ mod tests { proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![], - }], + functional_annotations: vec![] + }] }; let sparse_sa = SuffixArray::Original(vec![0, 2, 4], 2); @@ -591,8 +587,8 @@ mod tests { proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![], - }], + functional_annotations: vec![] + }] }; let sparse_sa = SuffixArray::Original(vec![6, 0, 1, 5, 4, 3, 2], 1); @@ -613,8 +609,8 @@ mod tests { proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![], - }], + functional_annotations: vec![] + }] }; let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); @@ -635,8 +631,8 @@ mod tests { proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![], - }], + functional_annotations: vec![] + }] }; let sparse_sa = SuffixArray::Original(vec![6, 4, 2, 0], 2); @@ -659,8 +655,8 @@ mod tests { proteins: vec![Protein { uniprot_id: String::new(), taxon_id: 0, - functional_annotations: vec![], - }], + functional_annotations: vec![] + }] }; let sparse_sa = SuffixArray::Original(vec![6, 5, 4, 3, 2, 1, 0], 1); diff --git a/sa-index/src/suffix_to_protein_index.rs b/sa-index/src/suffix_to_protein_index.rs index 1a224d2..a6a4e93 100644 --- a/sa-index/src/suffix_to_protein_index.rs +++ b/sa-index/src/suffix_to_protein_index.rs @@ -1,14 +1,14 @@ use clap::ValueEnum; use sa_mappings::proteins::{SEPARATION_CHARACTER, TERMINATION_CHARACTER}; +use text_compression::ProteinText; use crate::Nullable; -use text_compression::ProteinText; /// Enum used to define the commandline arguments and choose which index style is used #[derive(ValueEnum, Clone, Debug, PartialEq)] pub enum SuffixToProteinMappingStyle { Dense, - Sparse, + Sparse } /// Trait implemented by the SuffixToProtein mappings @@ -29,14 +29,14 @@ pub trait SuffixToProteinIndex: Send + Sync { #[derive(Debug, PartialEq)] pub struct DenseSuffixToProtein { // UniProtKB does not have more that u32::MAX proteins, so a larger type is not needed - mapping: Vec, + mapping: Vec } /// Mapping that uses O(m) memory with m the number of proteins, but retrieval of the protein is /// O(log m) #[derive(Debug, PartialEq)] pub struct SparseSuffixToProtein { - mapping: Vec, + mapping: Vec } impl SuffixToProteinIndex for DenseSuffixToProtein { @@ -113,9 +113,9 @@ mod tests { use crate::{ suffix_to_protein_index::{ - DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex, SuffixToProteinMappingStyle, + DenseSuffixToProtein, SparseSuffixToProtein, SuffixToProteinIndex, SuffixToProteinMappingStyle }, - Nullable, + Nullable }; fn build_text() -> ProteinText { @@ -138,7 +138,7 @@ mod tests { let u8_text = &build_text(); let index = DenseSuffixToProtein::new(u8_text); let expected = DenseSuffixToProtein { - mapping: vec![0, 0, 0, u32::NULL, 1, 1, u32::NULL, 2, 2, 2, u32::NULL], + mapping: vec![0, 0, 0, u32::NULL, 1, 1, u32::NULL, 2, 2, 2, u32::NULL] }; assert_eq!(index, expected); } diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs index 9285980..53e52b8 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -23,7 +23,7 @@ pub struct Protein { pub taxon_id: u32, /// The encoded functional annotations of the protein - pub functional_annotations: Vec, + pub functional_annotations: Vec } /// A struct that represents a collection of proteins @@ -32,7 +32,7 @@ pub struct Proteins { pub text: ProteinText, /// The proteins in the input string - pub proteins: Vec, + pub proteins: Vec } impl Protein { @@ -80,7 +80,7 @@ impl Proteins { proteins.push(Protein { uniprot_id: uniprot_id.to_string(), taxon_id, - functional_annotations, + functional_annotations }); } @@ -195,7 +195,7 @@ mod tests { .unwrap(); file.write( "P13579\t17\tKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH\tGO:0009279;IPR:IPR016364;IPR:IPR008816\n" - .as_bytes(), + .as_bytes() ) .unwrap(); @@ -207,7 +207,7 @@ mod tests { let protein = Protein { uniprot_id: "P12345".to_string(), taxon_id: 1, - functional_annotations: vec![0xD1, 0x11], + functional_annotations: vec![0xD1, 0x11] }; assert_eq!(protein.uniprot_id, "P12345"); @@ -225,14 +225,14 @@ mod tests { Protein { uniprot_id: "P12345".to_string(), taxon_id: 1, - functional_annotations: vec![0xD1, 0x11], + functional_annotations: vec![0xD1, 0x11] }, Protein { uniprot_id: "P54321".to_string(), taxon_id: 2, - functional_annotations: vec![0xD1, 0x11], + functional_annotations: vec![0xD1, 0x11] }, - ], + ] }; assert_eq!(proteins.proteins.len(), 2); diff --git a/sa-server/src/main.rs b/sa-server/src/main.rs index c65ba7c..5284546 100644 --- a/sa-server/src/main.rs +++ b/sa-server/src/main.rs @@ -2,14 +2,14 @@ use std::{ error::Error, fs::File, io::{BufReader, Read}, - sync::Arc, + sync::Arc }; use axum::{ extract::{DefaultBodyLimit, State}, http::StatusCode, routing::post, - Json, Router, + Json, Router }; use clap::Parser; use sa_compression::load_compressed_suffix_array; @@ -17,7 +17,7 @@ use sa_index::{ binary::load_suffix_array, peptide_search::{search_all_peptides, SearchResult}, sa_searcher::SparseSearcher, - SuffixArray, + SuffixArray }; use sa_mappings::proteins::Proteins; use serde::Deserialize; @@ -30,7 +30,7 @@ pub struct Arguments { #[arg(short, long)] database_file: String, #[arg(short, long)] - index_file: String, + index_file: String } /// Function used by serde to place a default value in the cutoff field of the input @@ -58,7 +58,7 @@ struct InputData { cutoff: usize, #[serde(default = "bool::default")] // default value is false // TODO: maybe default should be true? - equate_il: bool, + equate_il: bool } #[tokio::main] @@ -81,7 +81,7 @@ async fn main() { /// Returns the search results from the index as a JSON async fn search( State(searcher): State>, - data: Json, + data: Json ) -> Result>, StatusCode> { let search_result = search_all_peptides(&searcher, &data.peptides, data.cutoff, data.equate_il); diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs index dc7f71e..85e93b3 100644 --- a/text-compression/src/lib.rs +++ b/text-compression/src/lib.rs @@ -1,7 +1,7 @@ -use std::collections::HashMap; use std::{ + collections::HashMap, error::Error, - io::{BufRead, Write}, + io::{BufRead, Write} }; use bitarray::{data_to_writer, Binary, BitArray}; @@ -13,7 +13,7 @@ pub struct ProteinText { /// Hashmap storing the mapping between the character as `u8` and a 5 bit number. char_to_5bit: HashMap, /// Vector storing the mapping between the 5 bit number and the character as `u8`. - bit5_to_char: Vec, + bit5_to_char: Vec } impl ProteinText { @@ -184,7 +184,7 @@ pub struct ProteinTextSlice<'a> { /// The start of the slice. start: usize, // included /// The end of the slice. - end: usize, // excluded + end: usize // excluded } impl<'a> ProteinTextSlice<'a> { @@ -282,13 +282,13 @@ impl<'a> ProteinTextSlice<'a> { /// Structure representing an iterator over a `ProteinText` instance, iterating the characters of the text. pub struct ProteinTextIterator<'a> { protein_text: &'a ProteinText, - index: usize, + index: usize } /// Structure representing an iterator over a `ProteintextSlice` instance, iterating the characters of the slice. pub struct ProteinTextSliceIterator<'a> { text_slice: &'a ProteinTextSlice<'a>, - index: usize, + index: usize } impl<'a> Iterator for ProteinTextSliceIterator<'a> { @@ -394,7 +394,7 @@ mod tests { pub struct FailingWriter { /// The number of times the write function can be called before it fails. - pub valid_write_count: usize, + pub valid_write_count: usize } impl Write for FailingWriter { @@ -414,7 +414,7 @@ mod tests { pub struct FailingReader { /// The number of times the read function can be called before it fails. - pub valid_read_count: usize, + pub valid_read_count: usize } impl Read for FailingReader { @@ -545,15 +545,12 @@ mod tests { let mut writer = vec![]; dump_compressed_text(text, &mut writer).unwrap(); - assert_eq!( - writer, - vec![ - // bits per value - 5, // size of the text - 10, 0, 0, 0, 0, 0, 0, 0, // compressed text - 0, 128, 74, 232, 152, 66, 134, 8 - ] - ); + assert_eq!(writer, vec![ + // bits per value + 5, // size of the text + 10, 0, 0, 0, 0, 0, 0, 0, // compressed text + 0, 128, 74, 232, 152, 66, 134, 8 + ]); } #[test] From b145e776e53d81e146d9bcb606834299f1b44d1e Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Thu, 12 Sep 2024 16:29:17 +0200 Subject: [PATCH 23/27] fix cargo clippy errors --- bitarray/src/lib.rs | 2 +- sa-index/src/sa_searcher.rs | 10 +++++----- text-compression/src/lib.rs | 6 +++++- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index e4bd8a2..78b6c60 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -85,7 +85,7 @@ impl BitArray { /// * `index` - The index of the value to set. /// * `value` - The value to set at the specified index. pub fn set(&mut self, index: usize, value: u64) { - let value: u64 = value.into(); + let value: u64 = value; let start_block = index * self.bits_per_value / 64; let start_block_offset = index * self.bits_per_value % 64; diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index 03abf07..f709fd6 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -178,11 +178,11 @@ impl Searcher { // match as long as possible while index_in_search_string < search_string.len() && index_in_suffix < self.proteins.text.len() - && (search_string[index_in_search_string] == self.proteins.text.get(index_in_suffix) as u8 + && (search_string[index_in_search_string] == self.proteins.text.get(index_in_suffix) || (search_string[index_in_search_string] == b'L' - && self.proteins.text.get(index_in_suffix) as u8 == b'I') + && self.proteins.text.get(index_in_suffix) == b'I') || (search_string[index_in_search_string] == b'I' - && self.proteins.text.get(index_in_suffix) as u8 == b'L')) + && self.proteins.text.get(index_in_suffix) == b'L')) { index_in_suffix += 1; index_in_search_string += 1; @@ -201,10 +201,10 @@ impl Searcher { search_string[index_in_search_string] }; - let protein_char = if self.proteins.text.get(index_in_suffix) as u8 == b'L' { + let protein_char = if self.proteins.text.get(index_in_suffix) == b'L' { b'I' } else { - self.proteins.text.get(index_in_suffix) as u8 + self.proteins.text.get(index_in_suffix) }; is_cond_or_equal = condition_check(peptide_char, protein_char); diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs index 85e93b3..cac302f 100644 --- a/text-compression/src/lib.rs +++ b/text-compression/src/lib.rs @@ -73,7 +73,7 @@ impl ProteinText { /// # Returns /// /// An instance of `ProteinText` - pub fn from_vec(input_vec: &Vec) -> ProteinText { + pub fn from_vec(input_vec: &[u8]) -> ProteinText { let char_to_5bit = ProteinText::create_char_to_5bit_hashmap(); let bit5_to_char = ProteinText::create_bit5_to_char(); @@ -223,6 +223,10 @@ impl<'a> ProteinTextSlice<'a> { self.end - self.start } + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + /// Checks if the slice and a given array of `u8` are equal. /// I and L can be equated. /// From 3623691e8f7fc0a359fc416c0c86532f0846478a Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Thu, 12 Sep 2024 16:30:41 +0200 Subject: [PATCH 24/27] reformat with cargo --- sa-index/src/sa_searcher.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index f709fd6..e9590c8 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -179,10 +179,8 @@ impl Searcher { while index_in_search_string < search_string.len() && index_in_suffix < self.proteins.text.len() && (search_string[index_in_search_string] == self.proteins.text.get(index_in_suffix) - || (search_string[index_in_search_string] == b'L' - && self.proteins.text.get(index_in_suffix) == b'I') - || (search_string[index_in_search_string] == b'I' - && self.proteins.text.get(index_in_suffix) == b'L')) + || (search_string[index_in_search_string] == b'L' && self.proteins.text.get(index_in_suffix) == b'I') + || (search_string[index_in_search_string] == b'I' && self.proteins.text.get(index_in_suffix) == b'L')) { index_in_suffix += 1; index_in_search_string += 1; From 76ca5da9fc2e813f06be530b4889c0353d1646fa Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Thu, 12 Sep 2024 16:33:48 +0200 Subject: [PATCH 25/27] fix to long doc comment --- bitarray/src/lib.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 78b6c60..06235d1 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -149,9 +149,8 @@ impl BitArray { } } -/// Writes the data to a writer in a binary format using a bit array. This function is helpfull -/// when writing large amounts of data to a writer in chunks. The data is written in chunks of the -/// specified capacity, so memory usage is minimized. +/// Writes the data to a writer in a binary format using a bit array. The data is written +/// in chunks of the specified capacity, so memory usage is minimized. /// /// # Arguments /// From 1c440127b88d19400a009afb2d87ff90dc55a5eb Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Thu, 12 Sep 2024 16:35:12 +0200 Subject: [PATCH 26/27] reformat with cargo --- bitarray/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitarray/src/lib.rs b/bitarray/src/lib.rs index 06235d1..901b395 100644 --- a/bitarray/src/lib.rs +++ b/bitarray/src/lib.rs @@ -149,7 +149,7 @@ impl BitArray { } } -/// Writes the data to a writer in a binary format using a bit array. The data is written +/// Writes the data to a writer in a binary format using a bit array. The data is written /// in chunks of the specified capacity, so memory usage is minimized. /// /// # Arguments From 79bee50349dc0cffc5213546153713132401c226 Mon Sep 17 00:00:00 2001 From: SimonVandeVyver Date: Thu, 12 Sep 2024 16:39:42 +0200 Subject: [PATCH 27/27] add lifetime parameter to ProteinTextSlice --- text-compression/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs index cac302f..4866a6c 100644 --- a/text-compression/src/lib.rs +++ b/text-compression/src/lib.rs @@ -198,7 +198,7 @@ impl<'a> ProteinTextSlice<'a> { /// # Returns /// /// An instance of `ProteinTextSlice` - pub fn new(text: &'a ProteinText, start: usize, end: usize) -> ProteinTextSlice { + pub fn new(text: &'a ProteinText, start: usize, end: usize) -> ProteinTextSlice<'a> { Self { text, start, end } }