diff --git a/sa-builder/src/main.rs b/sa-builder/src/main.rs index 98a1414..01cc3c4 100644 --- a/sa-builder/src/main.rs +++ b/sa-builder/src/main.rs @@ -21,7 +21,7 @@ fn main() { eprintln!(); eprintln!("📋 Started loading the proteins..."); let start_proteins_time = get_time_ms().unwrap(); - let mut data = Proteins::try_from_database_file_without_annotations(&database_file) + let mut data = Proteins::try_from_database_file_uncompressed(&database_file) .unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str())); eprintln!( "✅ Successfully loaded the proteins in {} seconds!", diff --git a/sa-index/src/sa_searcher.rs b/sa-index/src/sa_searcher.rs index 2324046..4f4522e 100644 --- a/sa-index/src/sa_searcher.rs +++ b/sa-index/src/sa_searcher.rs @@ -342,7 +342,7 @@ impl Searcher { if suffix >= skip && ((skip == 0 || ProteinTextSlice::new(&self.proteins.text, suffix - skip, suffix) - .equals_slice(current_search_string_prefix, equate_il)) + .equals_slice(current_search_string_prefix, equate_il)) // Check the prefix && Self::check_suffix( skip, @@ -372,31 +372,6 @@ impl Searcher { } } - /// Returns true of the prefixes are the same - /// if `equate_il` is set to true, L and I are considered the same - /// - /// # Arguments - /// * `search_string_prefix` - The unchecked prefix of the string/peptide that is searched - /// * `index_prefix` - The unchecked prefix from the protein from the suffix array - /// * `equate_il` - True if we want to equate I and L during search, otherwise false - /// - /// # Returns - /// - /// Returns true if `search_string_prefix` and `index_prefix` are considered the same, otherwise - /// false - #[inline] - fn check_prefix(search_string_prefix: &[u8], index_prefix: &[u8], equate_il: bool) -> bool { - if equate_il { - search_string_prefix.iter().zip(index_prefix).all(|(&search_character, &index_character)| { - search_character == index_character - || (search_character == b'I' && index_character == b'L') - || (search_character == b'L' && index_character == b'I') - }) - } else { - search_string_prefix == index_prefix - } - } - /// Returns true of the search_string and index_string are equal /// This is automatically true if `equate_il` is set to true, since there matched during /// search where I = L If `equate_il` is set to false, we need to check if the I and diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs index ca3bdd7..626ead3 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -47,7 +47,6 @@ impl Proteins { /// /// # Arguments /// * `file` - The path to the database file - /// * `taxon_aggregator` - The `TaxonAggregator` to use /// /// # Returns /// @@ -97,7 +96,6 @@ impl Proteins { /// /// # Arguments /// * `file` - The path to the database file - /// * `taxon_aggregator` - The `TaxonAggregator` to use /// /// # Returns /// @@ -130,6 +128,45 @@ impl Proteins { Ok(text) } + + /// Creates a `vec` which represents all the proteins concatenated from the database file + /// + /// # Arguments + /// * `file` - The path to the database file + /// + /// # Returns + /// + /// Returns a `Result` containing the `Vec` + /// + /// # Errors + /// + /// Returns a `Box` if an error occurred while reading the database file + pub fn try_from_database_file_uncompressed(database_file: &str) -> Result, Box> { + let mut input_string: String = String::new(); + + let file = File::open(database_file)?; + + // Read the lines as bytes, since the input string is not guaranteed to be utf8 + // because of the encoded functional annotations + let mut lines = ByteLines::new(BufReader::new(file)); + + while let Some(Ok(line)) = lines.next() { + let mut fields = line.split(|b| *b == b'\t'); + + // only get the taxon id and sequence from each line, we don't need the other parts + let sequence = from_utf8(fields.nth(2).unwrap())?; + + input_string.push_str(&sequence.to_uppercase()); + input_string.push(SEPARATION_CHARACTER.into()); + } + + input_string.pop(); + input_string.push(TERMINATION_CHARACTER.into()); + + input_string.shrink_to_fit(); + Ok(input_string.into_bytes()) + + } } impl Index for Proteins { diff --git a/text-compression/src/lib.rs b/text-compression/src/lib.rs index b090826..871de5b 100644 --- a/text-compression/src/lib.rs +++ b/text-compression/src/lib.rs @@ -351,8 +351,8 @@ mod tests { let mut reader = std::io::BufReader::new(&data[..]); let compressed_text = load_compressed_text(&mut reader).unwrap(); - for i in 0..10 { - assert_eq!(compressed_text.get(i), i as u8 + 1); + for (i, c) in "CDEFGHIKLM".chars().enumerate() { + assert_eq!(compressed_text.get(i), c as u8); } }