use uncompressed text for SA construction

unipept · Sep 11, 2024 · e476461 · e476461
1 parent b0a804d
commit e476461
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 31 deletions.
diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs
@@ -21,7 +21,7 @@ fn main() {
     eprintln!();
     eprintln!("📋 Started loading the proteins...");
     let start_proteins_time = get_time_ms().unwrap();
-    let mut data = Proteins::try_from_database_file_without_annotations(&database_file)
+    let mut data = Proteins::try_from_database_file_uncompressed(&database_file)
         .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
     eprintln!(
         "✅ Successfully loaded the proteins in {} seconds!",

diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs
@@ -342,7 +342,7 @@ impl Searcher {
                     if suffix >= skip
                         && ((skip == 0
                             || ProteinTextSlice::new(&self.proteins.text, suffix - skip, suffix)
-                                .equals_slice(current_search_string_prefix, equate_il))
+                                .equals_slice(current_search_string_prefix, equate_il)) // Check the prefix
                             && 
                             Self::check_suffix(
                                 skip,
@@ -372,31 +372,6 @@ impl Searcher {
         }
     }
 
-    /// Returns true of the prefixes are the same
-    /// if `equate_il` is set to true, L and I are considered the same
-    ///
-    /// # Arguments
-    /// * `search_string_prefix` - The unchecked prefix of the string/peptide that is searched
-    /// * `index_prefix` - The unchecked prefix from the protein from the suffix array
-    /// * `equate_il` - True if we want to equate I and L during search, otherwise false
-    ///
-    /// # Returns
-    ///
-    /// Returns true if `search_string_prefix` and `index_prefix` are considered the same, otherwise
-    /// false
-    #[inline]
-    fn check_prefix(search_string_prefix: &[u8], index_prefix: &[u8], equate_il: bool) -> bool {
-        if equate_il {
-            search_string_prefix.iter().zip(index_prefix).all(|(&search_character, &index_character)| {
-                search_character == index_character
-                    || (search_character == b'I' && index_character == b'L')
-                    || (search_character == b'L' && index_character == b'I')
-            })
-        } else {
-            search_string_prefix == index_prefix
-        }
-    }
-
     /// Returns true if the search_string and index_string are equal
     /// This is automatically true if `equate_il` is set to true, since they were matched during
     /// search where I = L. If `equate_il` is set to false, we need to check if the I and

diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs
@@ -47,7 +47,6 @@ impl Proteins {
     ///
     /// # Arguments
     /// * `file` - The path to the database file
-    /// * `taxon_aggregator` - The `TaxonAggregator` to use
     ///
     /// # Returns
     ///
@@ -97,7 +96,6 @@ impl Proteins {
     ///
     /// # Arguments
     /// * `file` - The path to the database file
-    /// * `taxon_aggregator` - The `TaxonAggregator` to use
     ///
     /// # Returns
     ///
@@ -130,6 +128,45 @@ impl Proteins {
         Ok(text)
 
     }
+
+    /// Creates a `Vec<u8>` holding the uppercased sequences of all proteins in the database file, joined by separators
+    ///
+    /// # Arguments
+    /// * `file` - The path to the database file
+    ///
+    /// # Returns
+    ///
+    /// Returns a `Result` containing the `Vec<u8>`
+    ///
+    /// # Errors
+    ///
+    /// Returns a `Box<dyn Error>` if the database file cannot be opened or a sequence is not valid UTF-8
+    pub fn try_from_database_file_uncompressed(database_file: &str) -> Result<Vec<u8>, Box<dyn Error>> {
+        let mut input_string: String = String::new();
+
+        let file = File::open(database_file)?;
+
+        // Read the lines as bytes, since the input string is not guaranteed to be utf8
+        // because of the encoded functional annotations
+        let mut lines = ByteLines::new(BufReader::new(file));
+
+        while let Some(Ok(line)) = lines.next() {
+            let mut fields = line.split(|b| *b == b'\t');
+
+            // Only the sequence (the third tab-separated field) is needed; the other fields are skipped
+            let sequence = from_utf8(fields.nth(2).unwrap())?;
+
+            input_string.push_str(&sequence.to_uppercase());
+            input_string.push(SEPARATION_CHARACTER.into());
+        }
+
+        input_string.pop();
+        input_string.push(TERMINATION_CHARACTER.into());
+
+        input_string.shrink_to_fit();
+        Ok(input_string.into_bytes())
+
+    }
 }
 
 impl Index<usize> for Proteins {

diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs
@@ -351,8 +351,8 @@ mod tests {
         let mut reader = std::io::BufReader::new(&data[..]);
         let compressed_text = load_compressed_text(&mut reader).unwrap();
 
-        for i in 0..10 {
-            assert_eq!(compressed_text.get(i), i as u8 + 1);
+        for (i, c) in "CDEFGHIKLM".chars().enumerate() {
+            assert_eq!(compressed_text.get(i), c as u8);
         }
     }