diff --git a/sa-mappings/src/functionality.rs b/sa-mappings/src/functionality.rs index 652f775..18f4410 100644 --- a/sa-mappings/src/functionality.rs +++ b/sa-mappings/src/functionality.rs @@ -1,8 +1,19 @@ +//! This module contains the FunctionAggregator struct that is responsible for aggregating the functional annotations of proteins. + use crate::proteins::Protein; +/// A struct that represents a function aggregator pub struct FunctionAggregator {} impl FunctionAggregator { + /// Aggregates the functional annotations of proteins + /// + /// # Arguments + /// * `proteins` - A vector of proteins + /// + /// # Returns + /// + /// Returns a string containing the aggregated functional annotations pub fn aggregate(&self, proteins: Vec) -> String { proteins .iter() diff --git a/sa-mappings/src/lib.rs b/sa-mappings/src/lib.rs index 7b0545d..68fc8bd 100644 --- a/sa-mappings/src/lib.rs +++ b/sa-mappings/src/lib.rs @@ -1,3 +1,8 @@ +//! This library provides functionality to map protein sequences to their respective taxonomic identifiers +//! and functional annotations. + +#![warn(missing_docs)] + pub mod proteins; pub mod taxonomy; pub mod functionality; diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs index decf05e..d209ef4 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -1,14 +1,21 @@ +//! This module contains the `Protein` and `Proteins` structs, which are used to represent proteins and collections of proteins, respectively. + use std::{error::Error, fs::File, io::BufReader, ops::Index, str::from_utf8}; use bytelines::ByteLines; -use fa_compression::decode; +use fa_compression::algorithm1::decode; use umgap::taxon::TaxonId; use crate::taxonomy::TaxonAggregator; +/// The separation character used in the input string pub static SEPARATION_CHARACTER: u8 = b'-'; + +/// The termination character used in the input string +/// This character should be smaller than the separation character pub static TERMINATION_CHARACTER: u8 = b'$'; +/// A struct that represents a protein and its linked information #[derive(Debug)] pub struct Protein { /// The id of the protein @@ -24,6 +31,7 @@ pub struct Protein { pub functional_annotations: Vec, } +/// A struct that represents a collection of proteins #[derive(Debug)] pub struct Proteins { /// The input string containing all proteins @@ -34,12 +42,26 @@ pub struct Proteins { } impl Protein { + /// Returns the decoded functional annotations of the protein pub fn get_functional_annotations(&self) -> String { decode(&self.functional_annotations) } } impl Proteins { + /// Creates a new `Proteins` struct from a database file and a `TaxonAggregator` + /// + /// # Arguments + /// * `file` - The path to the database file + /// * `taxon_aggregator` - The `TaxonAggregator` to use + /// + /// # Returns + /// + /// Returns a `Result` containing the `Proteins` struct + /// + /// # Errors + /// + /// Returns a `Box` if an error occurred while reading the database file pub fn try_from_database_file(file: &str, taxon_aggregator: &TaxonAggregator) -> Result> { let mut input_string: String = String::new(); let mut proteins: Vec = Vec::new(); @@ -48,10 +70,14 @@ impl Proteins { let mut start_index = 0; + // Read the lines as bytes, since the input string is not guaranteed to be utf8 + // because of the encoded functional annotations let mut lines = ByteLines::new(BufReader::new(file)); while let Some(Ok(line)) = lines.next() { let mut fields = line.split(|b| *b == b'\t'); + + // uniprot_id, taxon_id and sequence should always contain valid utf8 let uniprot_id = from_utf8(fields.next().unwrap())?; let taxon_id = from_utf8(fields.next().unwrap())?.parse::()?; let sequence = from_utf8(fields.next().unwrap())?; @@ -80,11 +106,20 @@ impl Proteins { Ok(Self { input_string: input_string.into_bytes(), proteins }) } + /// Returns the sequence of a protein + /// + /// # Arguments + /// * `protein` - The protein to get the sequence from + /// + /// # Returns + /// + /// Returns a string slice containing the sequence of the protein pub fn get_sequence(&self, protein: &Protein) -> &str { let (start, length) = protein.sequence; let end = start + length as usize; - std::str::from_utf8(&self.input_string[start..end]).unwrap() // should never fail since the input string will always be utf8 + // unwrap should never fail since the input string will always be utf8 + std::str::from_utf8(&self.input_string[start..end]).unwrap() } } @@ -102,7 +137,7 @@ mod tests { use std::io::Write; use std::path::PathBuf; - use fa_compression::decode; + use fa_compression::algorithm1::decode; use tempdir::TempDir; use crate::taxonomy::AggregationMethod; diff --git a/sa-mappings/src/taxonomy.rs b/sa-mappings/src/taxonomy.rs index 8c5e968..cc364e1 100644 --- a/sa-mappings/src/taxonomy.rs +++ b/sa-mappings/src/taxonomy.rs @@ -1,19 +1,46 @@ +//! This module provides a `TaxonAggregator` struct that is used to aggregate taxonomic information. +//! It uses a taxonomy file to create a taxonomic tree and performs aggregation using different methods. + use std::error::Error; use umgap::{agg::{count, MultiThreadSafeAggregator}, rmq::{lca::LCACalculator, mix::MixCalculator}, taxon::{read_taxa_file, TaxonId, TaxonList, TaxonTree}}; +/// A struct that represents a taxon aggregator. pub struct TaxonAggregator { + /// A vector that contains the snapped taxon IDs. snapping: Vec>, + + /// The aggregator used to aggregate taxon IDs. aggregator: Box, + + /// The taxon list. taxon_list: TaxonList } +/// An enum that specifies the aggregation method to use. pub enum AggregationMethod { + /// The Lowest Common Ancestor (LCA) aggregation method. Lca, + + /// The LCA* aggregation method. LcaStar } impl TaxonAggregator { + /// Creates a new `TaxonAggregator` from a taxonomy file and an aggregation method. + /// + /// # Arguments + /// + /// * `file` - A string slice that represents the path to the taxonomy file. + /// * `method` - An `AggregationMethod` enum that specifies the aggregation method to use. + /// + /// # Returns + /// + /// Returns a `Result` containing the `TaxonAggregator` + /// + /// # Errors + /// + /// Returns a `Box` if an error occurred while reading the taxonomy file. pub fn try_from_taxonomy_file(file: &str, method: AggregationMethod) -> Result> { let taxons = read_taxa_file(file)?; let taxon_tree = TaxonTree::new(&taxons); @@ -28,14 +55,41 @@ impl TaxonAggregator { Ok(Self { snapping, aggregator, taxon_list }) } + /// Checks if a taxon exists in the taxon list. + /// + /// # Arguments + /// + /// * `taxon` - The taxon ID to check. + /// + /// # Returns + /// + /// Returns a boolean value indicating whether the taxon exists in the taxon list. pub fn taxon_exists(&self, taxon: TaxonId) -> bool { self.taxon_list.get(taxon).is_some() } + /// Snaps a taxon to its closest ancestor in the taxonomic tree. + /// + /// # Arguments + /// + /// * `taxon` - The taxon ID to snap. + /// + /// # Returns + /// + /// Returns the snapped taxon ID, or panics if the taxon cannot be snapped. pub fn snap_taxon(&self, taxon: TaxonId) -> TaxonId { self.snapping[taxon].unwrap_or_else(|| panic!("Could not snap taxon with id {taxon}")) } + /// Aggregates a list of taxon IDs using the specified aggregation method. + /// + /// # Arguments + /// + /// * `taxa` - A vector of taxon IDs to aggregate. + /// + /// # Returns + /// + /// Returns the aggregated taxon ID, or panics if aggregation fails. pub fn aggregate(&self, taxa: Vec) -> TaxonId { let count = count(taxa.into_iter().map(|t| (t, 1.0))); self.aggregator.aggregate(&count).unwrap_or_else(|_| panic!("Could not aggregate following taxon ids: {:?}", &count))