Skip to content

Commit

Permalink
provide documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
tibvdm committed Apr 8, 2024
1 parent 1580075 commit 8fa2882
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 3 deletions.
11 changes: 11 additions & 0 deletions sa-mappings/src/functionality.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,19 @@
//! This module contains the FunctionAggregator struct that is responsible for aggregating the functional annotations of proteins.
use crate::proteins::Protein;

/// A struct that represents a function aggregator
///
/// The struct has no fields; it is only used as the receiver of its methods.
#[derive(Debug, Default)]
pub struct FunctionAggregator {}

impl FunctionAggregator {
/// Aggregates the functional annotations of proteins
///
/// # Arguments
/// * `proteins` - A vector of proteins
///
/// # Returns
///
/// Returns a string containing the aggregated functional annotations
pub fn aggregate(&self, proteins: Vec<Protein>) -> String {
proteins
.iter()
Expand Down
5 changes: 5 additions & 0 deletions sa-mappings/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
//! This library provides functionality to map protein sequences to their respective taxonomic identifiers
//! and functional annotations.
#![warn(missing_docs)]

pub mod proteins;
pub mod taxonomy;
pub mod functionality;
41 changes: 38 additions & 3 deletions sa-mappings/src/proteins.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
//! This module contains the `Protein` and `Proteins` structs, which are used to represent proteins and collections of proteins, respectively.
use std::{error::Error, fs::File, io::BufReader, ops::Index, str::from_utf8};

use bytelines::ByteLines;
use fa_compression::decode;
use fa_compression::algorithm1::decode;
use umgap::taxon::TaxonId;

use crate::taxonomy::TaxonAggregator;

/// The separation character used in the input string (`b'-'`, byte value 45)
pub static SEPARATION_CHARACTER: u8 = b'-';

/// The termination character used in the input string (`b'$'`, byte value 36)
///
/// This character should be smaller than the separation character, which holds
/// for the current values since `b'$'` (36) is smaller than `b'-'` (45)
pub static TERMINATION_CHARACTER: u8 = b'$';

/// A struct that represents a protein and its linked information
#[derive(Debug)]
pub struct Protein {
/// The id of the protein
Expand All @@ -24,6 +31,7 @@ pub struct Protein {
pub functional_annotations: Vec<u8>,
}

/// A struct that represents a collection of proteins
#[derive(Debug)]
pub struct Proteins {
/// The input string containing all proteins
Expand All @@ -34,12 +42,26 @@ pub struct Proteins {
}

impl Protein {
    /// Decodes the functional annotations that are stored for this protein
    ///
    /// # Returns
    ///
    /// Returns the `String` produced by `decode` for the `functional_annotations` bytes
    pub fn get_functional_annotations(&self) -> String {
        let encoded_annotations = &self.functional_annotations;
        decode(encoded_annotations)
    }
}

impl Proteins {
/// Creates a new `Proteins` struct from a database file and a `TaxonAggregator`
///
/// # Arguments
/// * `file` - The path to the database file
/// * `taxon_aggregator` - The `TaxonAggregator` to use
///
/// # Returns
///
/// Returns a `Result` containing the `Proteins` struct
///
/// # Errors
///
/// Returns a `Box<dyn Error>` if an error occurred while reading the database file
pub fn try_from_database_file(file: &str, taxon_aggregator: &TaxonAggregator) -> Result<Self, Box<dyn Error>> {
let mut input_string: String = String::new();
let mut proteins: Vec<Protein> = Vec::new();
Expand All @@ -48,10 +70,14 @@ impl Proteins {

let mut start_index = 0;

// Read the lines as bytes, since the input string is not guaranteed to be utf8
// because of the encoded functional annotations
let mut lines = ByteLines::new(BufReader::new(file));

while let Some(Ok(line)) = lines.next() {
let mut fields = line.split(|b| *b == b'\t');

// uniprot_id, taxon_id and sequence should always contain valid utf8
let uniprot_id = from_utf8(fields.next().unwrap())?;
let taxon_id = from_utf8(fields.next().unwrap())?.parse::<TaxonId>()?;
let sequence = from_utf8(fields.next().unwrap())?;
Expand Down Expand Up @@ -80,11 +106,20 @@ impl Proteins {
Ok(Self { input_string: input_string.into_bytes(), proteins })
}

/// Returns the sequence of a protein
///
/// # Arguments
/// * `protein` - The protein to get the sequence from
///
/// # Returns
///
/// Returns a string slice containing the sequence of the protein
///
/// # Panics
///
/// Panics if the range described by `protein.sequence` lies outside the input string,
/// or if the bytes in that range are not valid utf8
pub fn get_sequence(&self, protein: &Protein) -> &str {
    // `protein.sequence` holds the start offset and the length of the sequence
    let (start, length) = protein.sequence;
    let end = start + length as usize;

    // unwrap should never fail since the input string will always be utf8
    std::str::from_utf8(&self.input_string[start..end]).unwrap()
}
}

Expand All @@ -102,7 +137,7 @@ mod tests {
use std::io::Write;
use std::path::PathBuf;

use fa_compression::decode;
use fa_compression::algorithm1::decode;
use tempdir::TempDir;

use crate::taxonomy::AggregationMethod;
Expand Down
54 changes: 54 additions & 0 deletions sa-mappings/src/taxonomy.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,46 @@
//! This module provides a `TaxonAggregator` struct that is used to aggregate taxonomic information.
//! It uses a taxonomy file to create a taxonomic tree and performs aggregation using different methods.
use std::error::Error;

use umgap::{agg::{count, MultiThreadSafeAggregator}, rmq::{lca::LCACalculator, mix::MixCalculator}, taxon::{read_taxa_file, TaxonId, TaxonList, TaxonTree}};

/// Aggregates taxon IDs with the help of a snapping table, an aggregator and a taxon list.
pub struct TaxonAggregator {
    /// Snapping table that is indexed by taxon ID; an entry holds the snapped taxon ID, if any.
    snapping: Vec<Option<TaxonId>>,

    /// The aggregator that performs the aggregation of taxon IDs.
    aggregator: Box<dyn MultiThreadSafeAggregator>,

    /// The taxon list that is consulted to check whether a taxon exists.
    taxon_list: TaxonList,
}

/// An enum that specifies the aggregation method to use.
///
/// The enum is a plain, field-less choice, so it is cheap to copy and can be compared and printed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AggregationMethod {
    /// The Lowest Common Ancestor (LCA) aggregation method.
    Lca,

    /// The LCA* aggregation method.
    LcaStar,
}

impl TaxonAggregator {
/// Creates a new `TaxonAggregator` from a taxonomy file and an aggregation method.
///
/// # Arguments
///
/// * `file` - A string slice that represents the path to the taxonomy file.
/// * `method` - An `AggregationMethod` enum that specifies the aggregation method to use.
///
/// # Returns
///
/// Returns a `Result` containing the `TaxonAggregator`
///
/// # Errors
///
/// Returns a `Box<dyn Error>` if an error occurred while reading the taxonomy file.
pub fn try_from_taxonomy_file(file: &str, method: AggregationMethod) -> Result<Self, Box<dyn Error>> {
let taxons = read_taxa_file(file)?;
let taxon_tree = TaxonTree::new(&taxons);
Expand All @@ -28,14 +55,41 @@ impl TaxonAggregator {
Ok(Self { snapping, aggregator, taxon_list })
}

/// Tells whether the taxon list contains an entry for the given taxon ID.
///
/// # Arguments
///
/// * `taxon` - The taxon ID to look up.
///
/// # Returns
///
/// Returns `true` when the lookup in the taxon list yields an entry, `false` otherwise.
pub fn taxon_exists(&self, taxon: TaxonId) -> bool {
    let entry = self.taxon_list.get(taxon);
    entry.is_some()
}

/// Looks up the snapped taxon ID for the given taxon ID.
///
/// # Arguments
///
/// * `taxon` - The taxon ID to snap; it is used as an index into the snapping table.
///
/// # Returns
///
/// Returns the snapped taxon ID that is stored for `taxon`.
///
/// # Panics
///
/// Panics if no snapped taxon ID is stored for `taxon`, or if `taxon` is out of bounds
/// for the snapping table.
pub fn snap_taxon(&self, taxon: TaxonId) -> TaxonId {
    match self.snapping[taxon] {
        Some(snapped) => snapped,
        None => panic!("Could not snap taxon with id {taxon}"),
    }
}

/// Aggregates a list of taxon IDs using the specified aggregation method.
///
/// # Arguments
///
/// * `taxa` - A vector of taxon IDs to aggregate.
///
/// # Returns
///
/// Returns the aggregated taxon ID, or panics if aggregation fails.
pub fn aggregate(&self, taxa: Vec<TaxonId>) -> TaxonId {
let count = count(taxa.into_iter().map(|t| (t, 1.0)));
self.aggregator.aggregate(&count).unwrap_or_else(|_| panic!("Could not aggregate following taxon ids: {:?}", &count))
Expand Down

0 comments on commit 8fa2882

Please sign in to comment.