Skip to content

Commit

Permalink
use uncompressed text for SA construction
Browse files Browse the repository at this point in the history
  • Loading branch information
SimonVandeVyver committed Sep 11, 2024
1 parent b0a804d commit e476461
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 31 deletions.
2 changes: 1 addition & 1 deletion sa-builder/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ fn main() {
eprintln!();
eprintln!("📋 Started loading the proteins...");
let start_proteins_time = get_time_ms().unwrap();
let mut data = Proteins::try_from_database_file_without_annotations(&database_file)
let mut data = Proteins::try_from_database_file_uncompressed(&database_file)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!(
"✅ Successfully loaded the proteins in {} seconds!",
Expand Down
27 changes: 1 addition & 26 deletions sa-index/src/sa_searcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ impl Searcher {
if suffix >= skip
&& ((skip == 0
|| ProteinTextSlice::new(&self.proteins.text, suffix - skip, suffix)
.equals_slice(current_search_string_prefix, equate_il))
.equals_slice(current_search_string_prefix, equate_il)) // Check the prefix
&&
Self::check_suffix(
skip,
Expand Down Expand Up @@ -372,31 +372,6 @@ impl Searcher {
}
}

/// Returns true if the prefixes are the same.
/// If `equate_il` is set to true, L and I are considered the same.
///
/// # Arguments
/// * `search_string_prefix` - The unchecked prefix of the string/peptide that is searched
/// * `index_prefix` - The unchecked prefix from the protein from the suffix array
/// * `equate_il` - True if we want to equate I and L during search, otherwise false
///
/// # Returns
///
/// Returns true if `search_string_prefix` and `index_prefix` are considered the same, otherwise
/// false. Prefixes of different lengths are never considered the same.
#[inline]
fn check_prefix(search_string_prefix: &[u8], index_prefix: &[u8], equate_il: bool) -> bool {
    if !equate_il {
        return search_string_prefix == index_prefix;
    }

    // `zip` stops at the shorter slice, so the lengths must be compared explicitly;
    // otherwise a shorter prefix would match any longer prefix it is a start of.
    search_string_prefix.len() == index_prefix.len()
        && search_string_prefix
            .iter()
            .zip(index_prefix)
            .all(|(&search_character, &index_character)| {
                search_character == index_character
                    || matches!((search_character, index_character), (b'I', b'L') | (b'L', b'I'))
            })
}

/// Returns true if the search_string and index_string are equal.
/// This is automatically true if `equate_il` is set to true, since they matched during
/// search where I = L. If `equate_il` is set to false, we need to check if the I and
Expand Down
41 changes: 39 additions & 2 deletions sa-mappings/src/proteins.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ impl Proteins {
///
/// # Arguments
/// * `file` - The path to the database file
/// * `taxon_aggregator` - The `TaxonAggregator` to use
///
/// # Returns
///
Expand Down Expand Up @@ -97,7 +96,6 @@ impl Proteins {
///
/// # Arguments
/// * `file` - The path to the database file
/// * `taxon_aggregator` - The `TaxonAggregator` to use
///
/// # Returns
///
Expand Down Expand Up @@ -130,6 +128,45 @@ impl Proteins {
Ok(text)

}

/// Creates a `Vec<u8>` which represents all the proteins concatenated from the database file
///
/// Every line of the database file is split on tabs and its third field is taken as the
/// protein sequence. The sequences are uppercased and joined with `SEPARATION_CHARACTER`;
/// the resulting text is closed with `TERMINATION_CHARACTER`.
///
/// # Arguments
/// * `database_file` - The path to the database file
///
/// # Returns
///
/// Returns a `Result` containing the `Vec<u8>`
///
/// # Errors
///
/// Returns a `Box<dyn Error>` if the database file cannot be opened or read, if a line has
/// fewer than three tab-separated fields, or if a sequence is not valid UTF-8
pub fn try_from_database_file_uncompressed(database_file: &str) -> Result<Vec<u8>, Box<dyn Error>> {
    let mut input_string = String::new();

    let file = File::open(database_file)?;

    // Read the lines as bytes, since the input string is not guaranteed to be utf8
    // because of the encoded functional annotations
    let mut lines = ByteLines::new(BufReader::new(file));

    while let Some(line) = lines.next() {
        // Propagate read errors instead of silently stopping at the first failing line,
        // which would hand back a truncated text as if it were complete
        let line = line?;

        // Only the sequence (third tab-separated field) is needed, the other parts are skipped
        let sequence_bytes = line
            .split(|b| *b == b'\t')
            .nth(2)
            .ok_or("database line does not contain a protein sequence field")?;
        let sequence = from_utf8(sequence_bytes)?;

        input_string.push_str(&sequence.to_uppercase());
        input_string.push(SEPARATION_CHARACTER.into());
    }

    // Swap the trailing separator (if there is one) for the termination character
    input_string.pop();
    input_string.push(TERMINATION_CHARACTER.into());

    input_string.shrink_to_fit();
    Ok(input_string.into_bytes())
}
}

impl Index<usize> for Proteins {
Expand Down
4 changes: 2 additions & 2 deletions text-compression/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -351,8 +351,8 @@ mod tests {
let mut reader = std::io::BufReader::new(&data[..]);
let compressed_text = load_compressed_text(&mut reader).unwrap();

for i in 0..10 {
assert_eq!(compressed_text.get(i), i as u8 + 1);
for (i, c) in "CDEFGHIKLM".chars().enumerate() {
assert_eq!(compressed_text.get(i), c as u8);
}
}

Expand Down

0 comments on commit e476461

Please sign in to comment.