Skip to content

Commit

Permalink
Add all mappings
Browse files Browse the repository at this point in the history
  • Loading branch information
tibvdm committed Apr 8, 2024
1 parent 19bb802 commit a10eafe
Show file tree
Hide file tree
Showing 9 changed files with 134 additions and 66 deletions.
84 changes: 67 additions & 17 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ resolver = "2"

members = [
"fa-compression",
"mappings"
"sa-mappings"
]
Empty file removed mappings/src/functionality.rs
Empty file.
23 changes: 0 additions & 23 deletions mappings/src/lib.rs

This file was deleted.

5 changes: 2 additions & 3 deletions mappings/Cargo.toml → sa-mappings/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
[package]
name = "mappings"
name = "sa-mappings"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dev-dependencies]
tempdir = "0.3.7"
static_assertions = "1.1.0"

[dependencies]
fa-compression = { path = "../fa-compression" }
memchr = "2.5.0"
bytelines = "2.5.0"
umgap = "1.1.0"
40 changes: 40 additions & 0 deletions sa-mappings/src/functionality.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
use crate::proteins::Protein;

/// Stateless helper that combines the functional annotations of several proteins
/// into a single string (see `aggregate`).
pub struct FunctionAggregator {}

impl FunctionAggregator {
    /// Builds one annotation string for a group of proteins.
    ///
    /// Each protein's annotations are obtained through
    /// `Protein::get_functional_annotations`, and the resulting strings are
    /// concatenated in input order with `;` between them. Nothing is
    /// deduplicated: a term carried by two proteins shows up twice.
    ///
    /// # Arguments
    /// * `proteins` - The proteins whose annotations should be combined.
    ///
    /// # Returns
    /// The `;`-joined annotation strings; an empty string when `proteins` is empty.
    pub fn aggregate(&self, proteins: Vec<Protein>) -> String {
        let mut decoded: Vec<String> = Vec::with_capacity(proteins.len());
        for protein in &proteins {
            decoded.push(protein.get_functional_annotations());
        }
        decoded.join(";")
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Two proteins carrying identical encoded annotations must produce the
    /// decoded annotation list twice, separated by `;`.
    #[test]
    fn test_aggregate() {
        // Both fixture proteins share the same encoded annotation bytes.
        let encoded: Vec<u8> = vec![0xD1, 0x11, 0xA3, 0x8A, 0xD1, 0x27, 0x47, 0x5E, 0x11, 0x99, 0x27];

        let first = Protein {
            uniprot_id: String::from("uniprot1"),
            sequence: (0, 3),
            taxon_id: 1,
            functional_annotations: encoded.clone(),
        };
        let second = Protein {
            uniprot_id: String::from("uniprot2"),
            sequence: (4, 3),
            taxon_id: 2,
            functional_annotations: encoded,
        };

        let function_aggregator = FunctionAggregator {};
        let aggregated = function_aggregator.aggregate(vec![first, second]);

        assert_eq!(aggregated, "GO:0009279;IPR:IPR016364;IPR:IPR008816;GO:0009279;IPR:IPR016364;IPR:IPR008816");
    }
}
3 changes: 3 additions & 0 deletions sa-mappings/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/// Protein records (`Protein`) and the `Proteins` collection loaded from a database file.
pub mod proteins;
/// Taxonomy support, including `TaxonAggregator`.
pub mod taxonomy;
/// Aggregation of functional annotations (`FunctionAggregator`).
pub mod functionality;
43 changes: 21 additions & 22 deletions mappings/src/proteins.rs → sa-mappings/src/proteins.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
use std::{error::Error, fs::File, io::{BufRead, BufReader}, ops::Index};
use std::{error::Error, fs::File, io::BufReader, ops::Index, str::from_utf8};

use memchr::memchr_iter;
use bytelines::ByteLines;
use fa_compression::decode;
use umgap::taxon::TaxonId;

use crate::{taxonomy::TaxonAggregator, DatabaseFormatError};
use crate::taxonomy::TaxonAggregator;

pub static SEPARATION_CHARACTER: u8 = b'-';
pub static TERMINATION_CHARACTER: u8 = b'$';
Expand All @@ -19,8 +20,8 @@ pub struct Protein {
/// the taxon id of the protein
pub taxon_id: TaxonId,

// /// The encoded functional annotations of the protein
functional_annotations: Vec<u8>,
/// The encoded functional annotations of the protein
pub functional_annotations: Vec<u8>,
}

#[derive(Debug)]
Expand All @@ -32,6 +33,12 @@ pub struct Proteins {
proteins: Vec<Protein>,
}

impl Protein {
    /// Returns this protein's functional annotations in decoded, textual form.
    ///
    /// The stored bytes are handed to `fa_compression::decode` on every call;
    /// the result is not cached.
    pub fn get_functional_annotations(&self) -> String {
        let encoded = &self.functional_annotations;
        decode(encoded)
    }
}

impl Proteins {
pub fn try_from_database_file(file: &str, taxon_aggregator: &TaxonAggregator) -> Result<Self, Box<dyn Error>> {
let mut input_string: String = String::new();
Expand All @@ -41,19 +48,14 @@ impl Proteins {

let mut start_index = 0;

let mut reader = BufReader::new(file);
let mut lines = ByteLines::new(BufReader::new(file));

let mut buffer = Vec::new();
println!("{:?}", reader.read_until(b'\n', &mut buffer));

println!("{:?}", buffer);

for line in reader.lines().into_iter().map_while(Result::ok) {
println!("{:?}", line);
let fields: Vec<String> = line.split('\t').map(str::to_string).collect();
let [uniprot_id, taxon_id, sequence, fa]: [String; 4] = fields.try_into().map_err(DatabaseFormatError::new)?;
println!("{:?}", taxon_id);
let taxon_id = taxon_id.parse::<TaxonId>()?;
while let Some(Ok(line)) = lines.next() {
let mut fields = line.split(|b| *b == b'\t');
let uniprot_id = from_utf8(fields.next().unwrap())?;
let taxon_id = from_utf8(fields.next().unwrap())?.parse::<TaxonId>()?;
let sequence = from_utf8(fields.next().unwrap())?;
let functional_annotations: Vec<u8> = fields.next().unwrap().iter().copied().collect();

if !taxon_aggregator.taxon_exists(taxon_id) {
continue;
Expand All @@ -63,10 +65,10 @@ impl Proteins {
input_string.push(SEPARATION_CHARACTER.into());

proteins.push(Protein {
uniprot_id,
uniprot_id: uniprot_id.to_string(),
sequence: (start_index, sequence.len() as u32),
taxon_id,
functional_annotations: fa.as_bytes().to_vec(),
functional_annotations
});

start_index += sequence.len() + 1;
Expand Down Expand Up @@ -160,8 +162,6 @@ mod tests {
let taxon_aggregator = TaxonAggregator::try_from_taxonomy_file(taxonomy_file.to_str().unwrap(), AggregationMethod::Lca).unwrap();
let proteins = Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator).unwrap();

println!("{:?}", proteins);

//assert_eq!(proteins.proteins.len(), 4);
assert_eq!(proteins.get_sequence(&proteins[0]), "MLPGLALLLLAAWTARALEV");
assert_eq!(proteins.get_sequence(&proteins[1]), "PTDGNAGLLAEPQIAMFCGRLNMHMNVQNG");
Expand Down Expand Up @@ -198,7 +198,6 @@ mod tests {
let proteins = Proteins::try_from_database_file(database_file.to_str().unwrap(), &taxon_aggregator).unwrap();

for protein in proteins.proteins.iter() {
println!("{:?}", protein.functional_annotations);
assert_eq!(decode(&protein.functional_annotations), "GO:0009279;IPR:IPR016364;IPR:IPR008816");
}
}
Expand Down
File renamed without changes.

0 comments on commit a10eafe

Please sign in to comment.