provide documentation

unipept · Apr 8, 2024 · 8fa2882 · 8fa2882
1 parent 1580075
commit 8fa2882
Show file tree

Hide file tree

Showing 4 changed files with 108 additions and 3 deletions.
diff --git a/sa-mappings/src/functionality.rs b/sa-mappings/src/functionality.rs
@@ -1,8 +1,19 @@
+//! This module contains the FunctionAggregator struct that is responsible for aggregating the functional annotations of proteins.
+
 use crate::proteins::Protein;
 
+/// A struct that represents a function aggregator
 pub struct FunctionAggregator {}
 
 impl FunctionAggregator {
+    /// Aggregates the functional annotations of proteins
+    /// 
+    /// # Arguments
+    /// * `proteins` - A vector of proteins
+    /// 
+    /// # Returns
+    /// 
+    /// Returns a string containing the aggregated functional annotations
     pub fn aggregate(&self, proteins: Vec<Protein>) -> String {
         proteins
             .iter()

diff --git a/sa-mappings/src/lib.rs b/sa-mappings/src/lib.rs
@@ -1,3 +1,8 @@
+//! This library provides functionality to map protein sequences to their respective taxonomic identifiers
+//! and functional annotations.
+
+#![warn(missing_docs)]
+
 pub mod proteins;
 pub mod taxonomy;
 pub mod functionality;
diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs
@@ -1,14 +1,21 @@
+//! This module contains the `Protein` and `Proteins` structs, which are used to represent proteins and collections of proteins, respectively.
+
 use std::{error::Error, fs::File, io::BufReader, ops::Index, str::from_utf8};
 
 use bytelines::ByteLines;
-use fa_compression::decode;
+use fa_compression::algorithm1::decode;
 use umgap::taxon::TaxonId;
 
 use crate::taxonomy::TaxonAggregator;
 
+/// The separation character used in the input string
 pub static SEPARATION_CHARACTER: u8 = b'-';
+
+/// The termination character used in the input string.
+/// This character should be smaller than the separation character.
 pub static TERMINATION_CHARACTER: u8 = b'$';
 
+/// A struct that represents a protein and its linked information
 #[derive(Debug)]
 pub struct Protein {
     /// The id of the protein
@@ -24,6 +31,7 @@ pub struct Protein {
     pub functional_annotations: Vec<u8>,
 }
 
+/// A struct that represents a collection of proteins
 #[derive(Debug)]
 pub struct Proteins {
     /// The input string containing all proteins
@@ -34,12 +42,26 @@ pub struct Proteins {
 }
 
 impl Protein {
+    /// Returns the decoded functional annotations of the protein
     pub fn get_functional_annotations(&self) -> String {
         decode(&self.functional_annotations)
     }
 }
 
 impl Proteins {
+    /// Creates a new `Proteins` struct from a database file and a `TaxonAggregator`
+    /// 
+    /// # Arguments
+    /// * `file` - The path to the database file
+    /// * `taxon_aggregator` - The `TaxonAggregator` to use
+    /// 
+    /// # Returns
+    /// 
+    /// Returns a `Result` containing the `Proteins` struct
+    /// 
+    /// # Errors
+    /// 
+    /// Returns a `Box<dyn Error>` if an error occurred while reading the database file
     pub fn try_from_database_file(file: &str, taxon_aggregator: &TaxonAggregator) -> Result<Self, Box<dyn Error>> {
         let mut input_string: String = String::new();
         let mut proteins: Vec<Protein> = Vec::new();
@@ -48,10 +70,14 @@ impl Proteins {
 
         let mut start_index = 0;
 
+        // Read the lines as bytes, since the database file is not guaranteed to be valid UTF-8
+        // because of the encoded functional annotations
         let mut lines = ByteLines::new(BufReader::new(file));
 
         while let Some(Ok(line)) = lines.next() {
             let mut fields = line.split(|b| *b == b'\t');
+
+            // uniprot_id, taxon_id and sequence should always contain valid utf8
             let uniprot_id = from_utf8(fields.next().unwrap())?;
             let taxon_id = from_utf8(fields.next().unwrap())?.parse::<TaxonId>()?;
             let sequence = from_utf8(fields.next().unwrap())?;
@@ -80,11 +106,20 @@ impl Proteins {
         Ok(Self { input_string: input_string.into_bytes(), proteins })
     }
 
+    /// Returns the sequence of a protein
+    ///
+    /// # Arguments
+    /// * `protein` - The protein to get the sequence from
+    /// 
+    /// # Returns
+    /// 
+    /// Returns a string slice containing the sequence of the protein
     pub fn get_sequence(&self, protein: &Protein) -> &str {
         let (start, length) = protein.sequence;
         let end = start + length as usize;
 
-        std::str::from_utf8(&self.input_string[start..end]).unwrap() // should never fail since the input string will always be utf8
+        // unwrap should never fail since the input string will always be utf8
+        std::str::from_utf8(&self.input_string[start..end]).unwrap()
     }
 }
 
@@ -102,7 +137,7 @@ mod tests {
     use std::io::Write;
     use std::path::PathBuf;
 
-    use fa_compression::decode;
+    use fa_compression::algorithm1::decode;
     use tempdir::TempDir;
 
     use crate::taxonomy::AggregationMethod;

diff --git a/sa-mappings/src/taxonomy.rs b/sa-mappings/src/taxonomy.rs
@@ -1,19 +1,46 @@
+//! This module provides a `TaxonAggregator` struct that is used to aggregate taxonomic information.
+//! It uses a taxonomy file to create a taxonomic tree and performs aggregation using different methods.
+
 use std::error::Error;
 
 use umgap::{agg::{count, MultiThreadSafeAggregator}, rmq::{lca::LCACalculator, mix::MixCalculator}, taxon::{read_taxa_file, TaxonId, TaxonList, TaxonTree}};
 
+/// A struct that represents a taxon aggregator.
 pub struct TaxonAggregator {
+    /// A vector that contains the snapped taxon IDs.
     snapping: Vec<Option<TaxonId>>,
+
+    /// The aggregator used to aggregate taxon IDs.
     aggregator: Box<dyn MultiThreadSafeAggregator>,
+
+    /// The taxon list.
     taxon_list: TaxonList
 }
 
+/// An enum that specifies the aggregation method to use.
 pub enum AggregationMethod {
+    /// The Lowest Common Ancestor (LCA) aggregation method.
     Lca,
+
+    /// The LCA* aggregation method.
     LcaStar
 }
 
 impl TaxonAggregator {
+    /// Creates a new `TaxonAggregator` from a taxonomy file and an aggregation method.
+    ///
+    /// # Arguments
+    ///
+    /// * `file` - A string slice that represents the path to the taxonomy file.
+    /// * `method` - An `AggregationMethod` enum that specifies the aggregation method to use.
+    ///
+    /// # Returns
+    ///
+    /// Returns a `Result` containing the `TaxonAggregator`
+    /// 
+    /// # Errors
+    /// 
+    /// Returns a `Box<dyn Error>` if an error occurred while reading the taxonomy file.
     pub fn try_from_taxonomy_file(file: &str, method: AggregationMethod) -> Result<Self, Box<dyn Error>> {
         let taxons = read_taxa_file(file)?;
         let taxon_tree = TaxonTree::new(&taxons);
@@ -28,14 +55,41 @@ impl TaxonAggregator {
         Ok(Self { snapping, aggregator, taxon_list })
     }
 
+    /// Checks if a taxon exists in the taxon list.
+    ///
+    /// # Arguments
+    ///
+    /// * `taxon` - The taxon ID to check.
+    ///
+    /// # Returns
+    ///
+    /// Returns a boolean value indicating whether the taxon exists in the taxon list.
     pub fn taxon_exists(&self, taxon: TaxonId) -> bool {
         self.taxon_list.get(taxon).is_some()
     }
 
+    /// Snaps a taxon to its closest ancestor in the taxonomic tree.
+    ///
+    /// # Arguments
+    ///
+    /// * `taxon` - The taxon ID to snap.
+    ///
+    /// # Returns
+    ///
+    /// Returns the snapped taxon ID. Panics if `taxon` is out of bounds for the snapping table or has no snapped taxon.
     pub fn snap_taxon(&self, taxon: TaxonId) -> TaxonId {
         self.snapping[taxon].unwrap_or_else(|| panic!("Could not snap taxon with id {taxon}"))
     }
 
+    /// Aggregates a list of taxon IDs using the specified aggregation method.
+    ///
+    /// # Arguments
+    ///
+    /// * `taxa` - A vector of taxon IDs to aggregate.
+    ///
+    /// # Returns
+    ///
+    /// Returns the aggregated taxon ID, or panics if aggregation fails.
     pub fn aggregate(&self, taxa: Vec<TaxonId>) -> TaxonId {
         let count = count(taxa.into_iter().map(|t| (t, 1.0)));
         self.aggregator.aggregate(&count).unwrap_or_else(|_| panic!("Could not aggregate following taxon ids: {:?}", &count))