diff --git a/Cargo.lock b/Cargo.lock index e627d58..3ec5475 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -113,6 +113,16 @@ version = "3.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa" +[[package]] +name = "bytelines" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1297656b3c221f5251560da47ce530d981345d3dabe822067c18ecb36e67aacb" +dependencies = [ + "futures-util", + "tokio", +] + [[package]] name = "byteorder" version = "1.5.0" @@ -419,6 +429,30 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" +[[package]] +name = "futures-core" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-task" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" + +[[package]] +name = "futures-util" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +dependencies = [ + "futures-core", + "futures-task", + "pin-project-lite", + "pin-utils", +] + [[package]] name = "getrandom" version = "0.2.12" @@ -550,17 +584,6 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" -[[package]] -name = "mappings" -version = "0.1.0" -dependencies = [ - "fa-compression", - "memchr", - "static_assertions", - "tempdir", - "umgap", -] - [[package]] name = "memchr" version = "2.7.2" @@ -693,6 +716,18 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "pin-project-lite" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "pkg-config" version = "0.3.30" @@ -925,6 +960,16 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +[[package]] +name = "sa-mappings" +version = "0.1.0" +dependencies = [ + "bytelines", + "fa-compression", + "tempdir", + "umgap", +] + [[package]] name = "same-file" version = "1.0.6" @@ -997,12 +1042,6 @@ dependencies = [ "serde", ] -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - [[package]] name = "strsim" version = "0.8.0" @@ -1129,6 +1168,17 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokio" +version = "1.37.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" +dependencies = [ + "backtrace", + "bytes", + "pin-project-lite", +] + [[package]] name = "umgap" version = "1.1.0" diff --git a/Cargo.toml b/Cargo.toml index 223bbaf..6c02f58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,5 +3,5 @@ resolver = "2" members = [ "fa-compression", - "mappings" + "sa-mappings" ] diff --git a/mappings/src/functionality.rs b/mappings/src/functionality.rs deleted file mode 100644 index e69de29..0000000 diff --git a/mappings/src/lib.rs b/mappings/src/lib.rs deleted file mode 100644 index ecf4839..0000000 --- a/mappings/src/lib.rs +++ /dev/null @@ -1,23 +0,0 @@ -use std::error::Error; - -mod proteins; -mod taxonomy; - -#[derive(Debug)] -struct DatabaseFormatError { - error: Vec -} - -impl DatabaseFormatError { - fn new(error: Vec) -> Self { - Self { error } - } -} - -impl std::fmt::Display for DatabaseFormatError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Expected the protein database file to have the following fields separated by a tab: \nBut tried to unpack following vector in 3 variables: {:?}", self.error) - } -} - -impl Error for DatabaseFormatError {} diff --git a/mappings/Cargo.toml b/sa-mappings/Cargo.toml similarity index 80% rename from mappings/Cargo.toml rename to sa-mappings/Cargo.toml index 7b01467..f02934a 100644 --- a/mappings/Cargo.toml +++ b/sa-mappings/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "mappings" +name = "sa-mappings" version = "0.1.0" edition = "2021" @@ -7,9 +7,8 @@ edition = "2021" [dev-dependencies] tempdir = "0.3.7" -static_assertions = "1.1.0" [dependencies] fa-compression = { path = "../fa-compression" } -memchr = "2.5.0" +bytelines = "2.5.0" umgap = "1.1.0" diff --git a/sa-mappings/src/functionality.rs b/sa-mappings/src/functionality.rs new file mode 100644 index 0000000..652f775 --- /dev/null +++ b/sa-mappings/src/functionality.rs @@ -0,0 +1,40 @@ +use crate::proteins::Protein; + +pub struct FunctionAggregator {} + +impl FunctionAggregator { + pub fn aggregate(&self, proteins: Vec) -> String { + proteins + .iter() + .map(|protein| protein.get_functional_annotations()) + .collect::>() + .join(";") + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_aggregate() { + let proteins = vec![ + Protein { + uniprot_id: "uniprot1".to_string(), + sequence: (0, 3), + taxon_id: 1, + functional_annotations: vec![0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27], + }, + Protein { + uniprot_id: "uniprot2".to_string(), + sequence: (4, 3), + taxon_id: 2, + functional_annotations: vec![0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27], + }, + ]; + + let function_aggregator = FunctionAggregator {}; + + assert_eq!(function_aggregator.aggregate(proteins), "GO:0009279;IPR:IPR016364;IPR:IPR008816;GO:0009279;IPR:IPR016364;IPR:IPR008816"); + } +} diff --git a/sa-mappings/src/lib.rs b/sa-mappings/src/lib.rs new file mode 100644 index 0000000..7b0545d --- /dev/null +++ b/sa-mappings/src/lib.rs @@ -0,0 +1,3 @@ +pub mod proteins; +pub mod taxonomy; +pub mod functionality; diff --git a/mappings/src/proteins.rs b/sa-mappings/src/proteins.rs similarity index 86% rename from mappings/src/proteins.rs rename to sa-mappings/src/proteins.rs index c7ff82e..decf05e 100644 --- a/mappings/src/proteins.rs +++ b/sa-mappings/src/proteins.rs @@ -1,9 +1,10 @@ -use std::{error::Error, fs::File, io::{BufRead, BufReader}, ops::Index}; +use std::{error::Error, fs::File, io::BufReader, ops::Index, str::from_utf8}; -use memchr::memchr_iter; +use bytelines::ByteLines; +use fa_compression::decode; use umgap::taxon::TaxonId; -use crate::{taxonomy::TaxonAggregator, DatabaseFormatError}; +use crate::taxonomy::TaxonAggregator; pub static SEPARATION_CHARACTER: u8 = b'-'; pub static TERMINATION_CHARACTER: u8 = b'$'; @@ -19,8 +20,8 @@ pub struct Protein { /// the taxon id of the protein pub taxon_id: TaxonId, - // /// The encoded functional annotations of the protein - functional_annotations: Vec, + /// The encoded functional annotations of the protein + pub functional_annotations: Vec, } #[derive(Debug)] @@ -32,6 +33,12 @@ pub struct Proteins { proteins: Vec, } +impl Protein { + pub fn get_functional_annotations(&self) -> String { + decode(&self.functional_annotations) + } +} + impl Proteins { pub fn try_from_database_file(file: &str, taxon_aggregator: &TaxonAggregator) -> Result> { let mut input_string: String = String::new(); @@ -41,19 +48,14 @@ impl Proteins { let mut start_index = 0; - let mut reader = BufReader::new(file); + let mut lines = ByteLines::new(BufReader::new(file)); - let mut buffer = Vec::new(); - println!("{:?}", reader.read_until(b'\n', &mut buffer)); - - println!("{:?}", buffer); - - for line in reader.lines().into_iter().map_while(Result::ok) { - println!("{:?}", line); - let fields: Vec = line.split('\t').map(str::to_string).collect(); - let [uniprot_id, taxon_id, sequence, fa]: [String; 4] = fields.try_into().map_err(DatabaseFormatError::new)?; - println!("{:?}", taxon_id); - let taxon_id = taxon_id.parse::()?; + while let Some(Ok(line)) = lines.next() { + let mut fields = line.split(|b| *b == b'\t'); + let uniprot_id = from_utf8(fields.next().unwrap())?; + let taxon_id = from_utf8(fields.next().unwrap())?.parse::()?; + let sequence = from_utf8(fields.next().unwrap())?; + let functional_annotations: Vec = fields.next().unwrap().iter().copied().collect(); if !taxon_aggregator.taxon_exists(taxon_id) { continue; @@ -63,10 +65,10 @@ impl Proteins { input_string.push(SEPARATION_CHARACTER.into()); proteins.push(Protein { - uniprot_id, + uniprot_id: uniprot_id.to_string(), sequence: (start_index, sequence.len() as u32), taxon_id, - functional_annotations: fa.as_bytes().to_vec(), + functional_annotations }); start_index += sequence.len() + 1; @@ -160,8 +162,6 @@ mod tests { let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::Lca).unwrap(); let proteins = Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator).unwrap(); - println!("{:?}", proteins); - //assert_eq!(proteins.proteins.len(), 4); assert_eq!(proteins.get_sequence(&proteins[0]), "MLPGLALLLLAAWTARALEV"); assert_eq!(proteins.get_sequence(&proteins[1]), "PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG"); @@ -198,7 +198,6 @@ mod tests { let proteins = Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator).unwrap(); for protein in proteins.proteins.iter() { - println!("{:?}", protein.functional_annotations); assert_eq!(decode(&protein.functional_annotations), "GO:0009279;IPR:IPR016364;IPR:IPR008816"); } } diff --git a/mappings/src/taxonomy.rs b/sa-mappings/src/taxonomy.rs similarity index 100% rename from mappings/src/taxonomy.rs rename to sa-mappings/src/taxonomy.rs