diff --git a/sa-mappings/src/functionality.rs b/sa-mappings/src/functionality.rs index 18f4410..b26152b 100644 --- a/sa-mappings/src/functionality.rs +++ b/sa-mappings/src/functionality.rs @@ -1,4 +1,5 @@ -//! This module contains the FunctionAggregator struct that is responsible for aggregating the functional annotations of proteins. +//! This module contains the FunctionAggregator struct that is responsible for aggregating the +//! functional annotations of proteins. use crate::proteins::Protein; @@ -7,12 +8,12 @@ pub struct FunctionAggregator {} impl FunctionAggregator { /// Aggregates the functional annotations of proteins - /// + /// /// # Arguments /// * `proteins` - A vector of proteins - /// + /// /// # Returns - /// + /// /// Returns a string containing the aggregated functional annotations pub fn aggregate(&self, proteins: Vec) -> String { proteins @@ -31,21 +32,28 @@ mod tests { fn test_aggregate() { let proteins = vec![ Protein { - uniprot_id: "uniprot1".to_string(), - sequence: (0, 3), - taxon_id: 1, - functional_annotations: vec![0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27], + uniprot_id: "uniprot1".to_string(), + sequence: (0, 3), + taxon_id: 1, + functional_annotations: vec![ + 0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27, + ] }, Protein { - uniprot_id: "uniprot2".to_string(), - sequence: (4, 3), - taxon_id: 2, - functional_annotations: vec![0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27], + uniprot_id: "uniprot2".to_string(), + sequence: (4, 3), + taxon_id: 2, + functional_annotations: vec![ + 0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27, + ] }, ]; let function_aggregator = FunctionAggregator {}; - assert_eq!(function_aggregator.aggregate(proteins), "GO:0009279;IPR:IPR016364;IPR:IPR008816;GO:0009279;IPR:IPR016364;IPR:IPR008816"); + assert_eq!( + function_aggregator.aggregate(proteins), + "GO:0009279;IPR:IPR016364;IPR:IPR008816;GO:0009279;IPR:IPR016364;IPR:IPR008816" + ); } } diff --git a/sa-mappings/src/lib.rs b/sa-mappings/src/lib.rs index 68fc8bd..6986c13 100644 --- a/sa-mappings/src/lib.rs +++ b/sa-mappings/src/lib.rs @@ -1,8 +1,8 @@ -//! This library provides functionality to map protein sequences to their respective taxonomic identifiers -//! and functional annotations. +//! This library provides functionality to map protein sequences to their respective taxonomic +//! identifiers and functional annotations. #![warn(missing_docs)] +pub mod functionality; pub mod proteins; pub mod taxonomy; -pub mod functionality; diff --git a/sa-mappings/src/proteins.rs b/sa-mappings/src/proteins.rs index d209ef4..786e700 100644 --- a/sa-mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -1,6 +1,13 @@ -//! This module contains the `Protein` and `Proteins` structs, which are used to represent proteins and collections of proteins, respectively. +//! This module contains the `Protein` and `Proteins` structs, which are used to represent proteins +//! and collections of proteins, respectively. -use std::{error::Error, fs::File, io::BufReader, ops::Index, str::from_utf8}; +use std::{ + error::Error, + fs::File, + io::BufReader, + ops::Index, + str::from_utf8 +}; use bytelines::ByteLines; use fa_compression::algorithm1::decode; @@ -28,7 +35,7 @@ pub struct Protein { pub taxon_id: TaxonId, /// The encoded functional annotations of the protein - pub functional_annotations: Vec, + pub functional_annotations: Vec } /// A struct that represents a collection of proteins @@ -38,7 +45,7 @@ pub struct Proteins { input_string: Vec, /// The proteins in the input string - proteins: Vec, + proteins: Vec } impl Protein { @@ -50,19 +57,22 @@ impl Protein { impl Proteins { /// Creates a new `Proteins` struct from a database file and a `TaxonAggregator` - /// + /// /// # Arguments /// * `file` - The path to the database file /// * `taxon_aggregator` - The `TaxonAggregator` to use - /// + /// /// # Returns - /// + /// /// Returns a `Result` containing the `Proteins` struct - /// + /// /// # Errors - /// + /// /// Returns a `Box` if an error occurred while reading the database file - pub fn try_from_database_file(file: &str, taxon_aggregator: &TaxonAggregator) -> Result> { + pub fn try_from_database_file( + file: &str, + taxon_aggregator: &TaxonAggregator + ) -> Result> { let mut input_string: String = String::new(); let mut proteins: Vec = Vec::new(); @@ -81,7 +91,7 @@ impl Proteins { let uniprot_id = from_utf8(fields.next().unwrap())?; let taxon_id = from_utf8(fields.next().unwrap())?.parse::()?; let sequence = from_utf8(fields.next().unwrap())?; - let functional_annotations: Vec = fields.next().unwrap().iter().copied().collect(); + let functional_annotations: Vec = fields.next().unwrap().to_vec(); if !taxon_aggregator.taxon_exists(taxon_id) { continue; @@ -103,23 +113,26 @@ impl Proteins { input_string.pop(); input_string.push(TERMINATION_CHARACTER.into()); - Ok(Self { input_string: input_string.into_bytes(), proteins }) + Ok(Self { + input_string: input_string.into_bytes(), + proteins + }) } /// Returns the sequence of a protein /// /// # Arguments /// * `protein` - The protein to get the sequence from - /// + /// /// # Returns - /// + /// /// Returns a string slice containing the sequence of the protein pub fn get_sequence(&self, protein: &Protein) -> &str { let (start, length) = protein.sequence; let end = start + length as usize; // unwrap should never fail since the input string will always be utf8 - std::str::from_utf8(&self.input_string[start..end]).unwrap() + std::str::from_utf8(&self.input_string[start .. end]).unwrap() } } @@ -133,32 +146,41 @@ impl Index for Proteins { #[cfg(test)] mod tests { - use std::fs::File; - use std::io::Write; - use std::path::PathBuf; + use std::{ + fs::File, + io::Write, + path::PathBuf + }; use fa_compression::algorithm1::decode; use tempdir::TempDir; - use crate::taxonomy::AggregationMethod; - use super::*; + use crate::taxonomy::AggregationMethod; fn create_database_file(tmp_dir: &TempDir) -> PathBuf { let database_file = tmp_dir.path().join("database.tsv"); let mut file = File::create(&database_file).unwrap(); - file.write("P12345\t1\tMLPGLALLLLAAWTARALEV\t".as_bytes()).unwrap(); - file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]).unwrap(); + file.write("P12345\t1\tMLPGLALLLLAAWTARALEV\t".as_bytes()) + .unwrap(); + file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]) + .unwrap(); file.write("\n".as_bytes()).unwrap(); - file.write("P54321\t2\tPTDGNAGLLAEPQIAMFCGRLNMHMNVQNG\t".as_bytes()).unwrap(); - file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]).unwrap(); + file.write("P54321\t2\tPTDGNAGLLAEPQIAMFCGRLNMHMNVQNG\t".as_bytes()) + .unwrap(); + file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]) + .unwrap(); file.write("\n".as_bytes()).unwrap(); - file.write("P67890\t6\tKWDSDPSGTKTCIDT\t".as_bytes()).unwrap(); - file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]).unwrap(); + file.write("P67890\t6\tKWDSDPSGTKTCIDT\t".as_bytes()) + .unwrap(); + file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]) + .unwrap(); file.write("\n".as_bytes()).unwrap(); - file.write("P13579\t17\tKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH\t".as_bytes()).unwrap(); - file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]).unwrap(); + file.write("P13579\t17\tKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH\t".as_bytes()) + .unwrap(); + file.write_all(&[0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27]) + .unwrap(); file.write("\n".as_bytes()).unwrap(); database_file @@ -194,14 +216,23 @@ mod tests { let database_file = create_database_file(&tmp_dir); let taxonomy_file = create_taxonomy_file(&tmp_dir); - let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::Lca).unwrap(); - let proteins = Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator).unwrap(); + let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::Lca + ) + .unwrap(); + let proteins = + Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator) + .unwrap(); //assert_eq!(proteins.proteins.len(), 4); assert_eq!(proteins.get_sequence(&proteins[0]), "MLPGLALLLLAAWTARALEV"); assert_eq!(proteins.get_sequence(&proteins[1]), "PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG"); assert_eq!(proteins.get_sequence(&proteins[2]), "KWDSDPSGTKTCIDT"); - assert_eq!(proteins.get_sequence(&proteins[3]), "KEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH"); + assert_eq!( + proteins.get_sequence(&proteins[3]), + "KEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPH" + ); } #[test] @@ -212,8 +243,14 @@ mod tests { let database_file = create_database_file(&tmp_dir); let taxonomy_file = create_taxonomy_file(&tmp_dir); - let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::Lca).unwrap(); - let proteins = Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator).unwrap(); + let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::Lca + ) + .unwrap(); + let proteins = + Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator) + .unwrap(); let taxa = vec![1, 2, 6, 17]; for (i, protein) in proteins.proteins.iter().enumerate() { @@ -229,11 +266,20 @@ mod tests { let database_file = create_database_file(&tmp_dir); let taxonomy_file = create_taxonomy_file(&tmp_dir); - let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::Lca).unwrap(); - let proteins = Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator).unwrap(); + let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::Lca + ) + .unwrap(); + let proteins = + Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator) + .unwrap(); for protein in proteins.proteins.iter() { - assert_eq!(decode(&protein.functional_annotations), "GO:0009279;IPR:IPR016364;IPR:IPR008816"); + assert_eq!( + decode(&protein.functional_annotations), + "GO:0009279;IPR:IPR016364;IPR:IPR008816" + ); } } } diff --git a/sa-mappings/src/taxonomy.rs b/sa-mappings/src/taxonomy.rs index cc364e1..ada93ff 100644 --- a/sa-mappings/src/taxonomy.rs +++ b/sa-mappings/src/taxonomy.rs @@ -1,9 +1,25 @@ //! This module provides a `TaxonAggregator` struct that is used to aggregate taxonomic information. -//! It uses a taxonomy file to create a taxonomic tree and performs aggregation using different methods. +//! It uses a taxonomy file to create a taxonomic tree and performs aggregation using different +//! methods. use std::error::Error; -use umgap::{agg::{count, MultiThreadSafeAggregator}, rmq::{lca::LCACalculator, mix::MixCalculator}, taxon::{read_taxa_file, TaxonId, TaxonList, TaxonTree}}; +use umgap::{ + agg::{ + count, + MultiThreadSafeAggregator + }, + rmq::{ + lca::LCACalculator, + mix::MixCalculator + }, + taxon::{ + read_taxa_file, + TaxonId, + TaxonList, + TaxonTree + } +}; /// A struct that represents a taxon aggregator. pub struct TaxonAggregator { @@ -37,11 +53,14 @@ impl TaxonAggregator { /// # Returns /// /// Returns a `Result` containing the `TaxonAggregator` - /// + /// /// # Errors - /// + /// /// Returns a `Box` if an error occurred while reading the taxonomy file. - pub fn try_from_taxonomy_file(file: &str, method: AggregationMethod) -> Result> { + pub fn try_from_taxonomy_file( + file: &str, + method: AggregationMethod + ) -> Result> { let taxons = read_taxa_file(file)?; let taxon_tree = TaxonTree::new(&taxons); let taxon_list = TaxonList::new(taxons); @@ -49,10 +68,14 @@ impl TaxonAggregator { let aggregator: Box = match method { AggregationMethod::Lca => Box::new(MixCalculator::new(taxon_tree, 1.0)), - AggregationMethod::LcaStar => Box::new(LCACalculator::new(taxon_tree)), + AggregationMethod::LcaStar => Box::new(LCACalculator::new(taxon_tree)) }; - Ok(Self { snapping, aggregator, taxon_list }) + Ok(Self { + snapping, + aggregator, + taxon_list + }) } /// Checks if a taxon exists in the taxon list. @@ -92,15 +115,19 @@ impl TaxonAggregator { /// Returns the aggregated taxon ID, or panics if aggregation fails. pub fn aggregate(&self, taxa: Vec) -> TaxonId { let count = count(taxa.into_iter().map(|t| (t, 1.0))); - self.aggregator.aggregate(&count).unwrap_or_else(|_| panic!("Could not aggregate following taxon ids: {:?}", &count)) + self.aggregator + .aggregate(&count) + .unwrap_or_else(|_| panic!("Could not aggregate following taxon ids: {:?}", &count)) } } #[cfg(test)] mod tests { - use std::fs::File; - use std::io::Write; - use std::path::PathBuf; + use std::{ + fs::File, + io::Write, + path::PathBuf + }; use tempdir::TempDir; @@ -132,24 +159,36 @@ mod tests { fn test_try_from_taxonomy_file() { // Create a temporary directory for this test let tmp_dir = TempDir::new("test_try_from_taxonomy_file").unwrap(); - + let taxonomy_file = create_taxonomy_file(&tmp_dir); - let _ = TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::Lca).unwrap(); - let _ = TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(); + let _ = TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::Lca + ) + .unwrap(); + let _ = TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::LcaStar + ) + .unwrap(); } #[test] fn test_taxon_exists() { // Create a temporary directory for this test let tmp_dir = TempDir::new("test_taxon_exists").unwrap(); - + let taxonomy_file = create_taxonomy_file(&tmp_dir); - let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::Lca).unwrap(); + let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::Lca + ) + .unwrap(); - for i in 0..=20 { - if [ 0, 3, 4, 5, 8, 12, 15 ].contains(&i) { + for i in 0 ..= 20 { + if [0, 3, 4, 5, 8, 12, 15].contains(&i) { assert!(!taxon_aggregator.taxon_exists(i)); } else { assert!(taxon_aggregator.taxon_exists(i)); @@ -161,13 +200,17 @@ mod tests { fn test_snap_taxon() { // Create a temporary directory for this test let tmp_dir = TempDir::new("test_snap_taxon").unwrap(); - + let taxonomy_file = create_taxonomy_file(&tmp_dir); - let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::Lca).unwrap(); + let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::Lca + ) + .unwrap(); - for i in 0..=20 { - if ![ 0, 3, 4, 5, 8, 12, 15 ].contains(&i) { + for i in 0 ..= 20 { + if ![0, 3, 4, 5, 8, 12, 15].contains(&i) { assert_eq!(taxon_aggregator.snap_taxon(i), i); } } @@ -177,27 +220,35 @@ mod tests { fn test_aggregate_lca() { // Create a temporary directory for this test let tmp_dir = TempDir::new("test_aggregate").unwrap(); - + let taxonomy_file = create_taxonomy_file(&tmp_dir); - let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::Lca).unwrap(); + let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::Lca + ) + .unwrap(); - assert_eq!(taxon_aggregator.aggregate(vec![ 7, 9 ]), 6); - assert_eq!(taxon_aggregator.aggregate(vec![ 11, 14 ]), 10); - assert_eq!(taxon_aggregator.aggregate(vec![ 17, 19 ]), 17); + assert_eq!(taxon_aggregator.aggregate(vec![7, 9]), 6); + assert_eq!(taxon_aggregator.aggregate(vec![11, 14]), 10); + assert_eq!(taxon_aggregator.aggregate(vec![17, 19]), 17); } #[test] fn test_aggregate_lca_star() { // Create a temporary directory for this test let tmp_dir = TempDir::new("test_aggregate").unwrap(); - + let taxonomy_file = create_taxonomy_file(&tmp_dir); - let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::LcaStar).unwrap(); + let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file( + taxonomy_file.to_str().unwrap(), + AggregationMethod::LcaStar + ) + .unwrap(); - assert_eq!(taxon_aggregator.aggregate(vec![ 7, 9 ]), 6); - assert_eq!(taxon_aggregator.aggregate(vec![ 11, 14 ]), 10); - assert_eq!(taxon_aggregator.aggregate(vec![ 17, 19 ]), 19); + assert_eq!(taxon_aggregator.aggregate(vec![7, 9]), 6); + assert_eq!(taxon_aggregator.aggregate(vec![11, 14]), 10); + assert_eq!(taxon_aggregator.aggregate(vec![17, 19]), 19); } }