diff --git a/api/Cargo.toml b/api/Cargo.toml
index bef0020..ee0241d 100644
--- a/api/Cargo.toml
+++ b/api/Cargo.toml
@@ -26,3 +26,4 @@ http = "1.1.0"
 tower-layer = "0.3.2"
 tower-service = "0.3.2"
 itertools = "0.13.0"
+reqwest = { version = "0.12.8", features = [ "json" ] }
diff --git a/api/src/controllers/api/mod.rs b/api/src/controllers/api/mod.rs
index 13fe3d5..753c874 100644
--- a/api/src/controllers/api/mod.rs
+++ b/api/src/controllers/api/mod.rs
@@ -40,3 +40,7 @@ pub fn default_descendants_ranks() -> Vec {
 pub fn default_link() -> bool {
     false
 }
+
+pub fn default_compact() -> bool {
+    false
+}
diff --git a/api/src/controllers/api/pept2ec.rs b/api/src/controllers/api/pept2ec.rs
index fc03bb2..de32f9f 100644
--- a/api/src/controllers/api/pept2ec.rs
+++ b/api/src/controllers/api/pept2ec.rs
@@ -44,7 +44,7 @@ async fn handler(
     }
 
     let unique_peptides: Vec<String> = peptide_counts.keys().cloned().collect();
 
-    let result = index.analyse(&unique_peptides, equate_il, None, None);
+    let result = index.analyse(&unique_peptides, equate_il, false, None);
 
     let ec_store = datastore.ec_store();
diff --git a/api/src/controllers/api/pept2funct.rs b/api/src/controllers/api/pept2funct.rs
index f5cc2ed..12fce15 100644
--- a/api/src/controllers/api/pept2funct.rs
+++ b/api/src/controllers/api/pept2funct.rs
@@ -42,7 +42,7 @@ async fn handler(
     Parameters { input, equate_il, extra, domains }: Parameters
 ) -> Result, ()> {
     let input = sanitize_peptides(input);
-    let result = index.analyse(&input, equate_il, None, None);
+    let result = index.analyse(&input, equate_il, false, None);
 
     let ec_store = datastore.ec_store();
     let go_store = datastore.go_store();
diff --git a/api/src/controllers/api/pept2go.rs b/api/src/controllers/api/pept2go.rs
index 8e74377..fd84e3a 100644
--- a/api/src/controllers/api/pept2go.rs
+++ b/api/src/controllers/api/pept2go.rs
@@ -38,7 +38,7 @@ async fn handler(
     Parameters { input, equate_il, extra, domains }: Parameters
 ) -> Result, ()> {
     let input = sanitize_peptides(input);
 
-    let result = index.analyse(&input, equate_il, None, None);
+    let result = index.analyse(&input, equate_il, false, None);
 
     let go_store = datastore.go_store();
diff --git a/api/src/controllers/api/pept2interpro.rs b/api/src/controllers/api/pept2interpro.rs
index bade092..acee5dd 100644
--- a/api/src/controllers/api/pept2interpro.rs
+++ b/api/src/controllers/api/pept2interpro.rs
@@ -38,7 +38,7 @@ async fn handler(
     Parameters { input, equate_il, extra, domains }: Parameters
 ) -> Result, ()> {
     let input = sanitize_peptides(input);
 
-    let result = index.analyse(&input, equate_il, None, None);
+    let result = index.analyse(&input, equate_il, false, None);
 
     let interpro_store = datastore.interpro_store();
diff --git a/api/src/controllers/api/pept2lca.rs b/api/src/controllers/api/pept2lca.rs
index 60ab7a3..a8042de 100644
--- a/api/src/controllers/api/pept2lca.rs
+++ b/api/src/controllers/api/pept2lca.rs
@@ -51,7 +51,7 @@ async fn handler(
     version: LineageVersion
 ) -> Result, ()> {
     let input = sanitize_peptides(input);
-    let result = index.analyse(&input, equate_il, None, None);
+    let result = index.analyse(&input, equate_il, false, None);
 
     let taxon_store = datastore.taxon_store();
     let lineage_store = datastore.lineage_store();
diff --git a/api/src/controllers/api/pept2prot.rs b/api/src/controllers/api/pept2prot.rs
index fc6ede8..aa3406d 100644
--- a/api/src/controllers/api/pept2prot.rs
+++ b/api/src/controllers/api/pept2prot.rs
@@ -1,6 +1,5 @@
 use std::collections::HashSet;
 use axum::{extract::State, Json};
-use itertools::Itertools;
 use database::get_accessions_map;
 use serde::{Deserialize, Serialize};
 
@@ -57,7 +56,7 @@ async fn handler(
 
     let connection = database.get_conn().await?;
 
-    let result = index.analyse(&input, equate_il, None, Some(tryptic));
+    let result = index.analyse(&input, equate_il, tryptic, None);
 
     let accession_numbers: HashSet<String> = result
         .iter()
diff --git a/api/src/controllers/api/pept2taxa.rs b/api/src/controllers/api/pept2taxa.rs
index 60c7c0b..7e0ef46 100644
--- a/api/src/controllers/api/pept2taxa.rs
+++ b/api/src/controllers/api/pept2taxa.rs
@@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize};
 
 use crate::{
     controllers::{
-        api::{default_equate_il, default_extra, default_names},
+        api::{default_equate_il, default_extra, default_names, default_compact, default_tryptic},
         generate_handlers
     },
     helpers::lineage_helper::{
@@ -25,11 +25,22 @@ pub struct Parameters {
     #[serde(default = "default_extra")]
     extra: bool,
     #[serde(default = "default_names")]
-    names: bool
+    names: bool,
+    #[serde(default = "default_tryptic")]
+    tryptic: bool,
+    #[serde(default = "default_compact")]
+    compact: bool
 }
 
 #[derive(Serialize)]
-pub struct TaxaInformation {
+#[serde(untagged)]
+pub enum TaxaInformation {
+    Dense (DenseTaxaInformation),
+    Compact (CompactTaxaInformation)
+}
+
+#[derive(Serialize)]
+pub struct DenseTaxaInformation {
     peptide: String,
     #[serde(flatten)]
     taxon: Taxon,
@@ -37,6 +48,12 @@ pub struct TaxaInformation {
     lineage: Option
 }
 
+#[derive(Serialize)]
+pub struct CompactTaxaInformation {
+    peptide: String,
+    taxa: Vec<u32>
+}
+
 #[derive(Serialize)]
 pub struct Taxon {
     taxon_id: u32,
@@ -46,15 +63,34 @@ pub struct Taxon {
 
 async fn handler(
     State(AppState { index, datastore, .. }): State<AppState>,
-    Parameters { input, equate_il, extra, names }: Parameters,
+    Parameters { input, equate_il, extra, names, tryptic, compact }: Parameters,
     version: LineageVersion
 ) -> Result, ()> {
     let input = sanitize_peptides(input);
 
-    let result = index.analyse(&input, equate_il, None, None);
+    let result = index.analyse(&input, equate_il, tryptic, None);
 
     let taxon_store = datastore.taxon_store();
     let lineage_store = datastore.lineage_store();
 
+    if compact {
+        return Ok(result
+            .into_iter()
+            .filter_map(|item| {
+                let item_taxa: Vec<u32> = item.proteins.iter().map(|protein| protein.taxon).filter(|&taxon_id| taxon_store.is_valid(taxon_id)).collect();
+
+                if item_taxa.is_empty() {
+                    return None;
+                }
+
+                Some(TaxaInformation::Compact(CompactTaxaInformation {
+                    peptide: item.sequence,
+                    taxa: item_taxa,
+                }))
+            })
+            .collect()
+        )
+    }
+
     Ok(result
         .into_iter()
         .flat_map(|item| {
@@ -67,7 +103,7 @@ async fn handler(
                     (false, _) => None
                 };
 
-                Some(TaxaInformation {
+                Some(TaxaInformation::Dense(DenseTaxaInformation {
                     peptide: item.sequence.clone(),
                     taxon: Taxon {
                         taxon_id: taxon,
@@ -75,7 +111,7 @@ async fn handler(
                         taxon_rank: rank.clone().into()
                     },
                     lineage
-                })
+                }))
             }
             )
         })
diff --git a/api/src/controllers/api/peptinfo.rs b/api/src/controllers/api/peptinfo.rs
index cc41c65..93b4699 100644
--- a/api/src/controllers/api/peptinfo.rs
+++ b/api/src/controllers/api/peptinfo.rs
@@ -61,7 +61,7 @@ async fn handler(
     version: LineageVersion
 ) -> Result, ()> {
     let input = sanitize_peptides(input);
-    let result = index.analyse(&input, equate_il, None, None);
+    let result = index.analyse(&input, equate_il, false, None);
 
     let ec_store = datastore.ec_store();
     let go_store = datastore.go_store();
diff --git a/api/src/controllers/mpa/pept2data.rs b/api/src/controllers/mpa/pept2data.rs
index b871e25..a8e085a 100644
--- a/api/src/controllers/mpa/pept2data.rs
+++ b/api/src/controllers/mpa/pept2data.rs
@@ -47,7 +47,7 @@ async fn handler(
     peptides.dedup();
     let peptides = sanitize_peptides(peptides);
 
-    let result = index.analyse(&peptides, equate_il, Some(10_000), Some(tryptic));
+    let result = index.analyse(&peptides, equate_il, tryptic, Some(10_000));
 
     let taxon_store = datastore.taxon_store();
     let lineage_store = datastore.lineage_store();
diff --git a/api/src/controllers/mpa/pept2filtered.rs b/api/src/controllers/mpa/pept2filtered.rs
index a60b79d..a3a4bee 100644
--- a/api/src/controllers/mpa/pept2filtered.rs
+++ b/api/src/controllers/mpa/pept2filtered.rs
@@ -1,31 +1,48 @@
+use std::collections::HashSet;
 use axum::{extract::State, Json};
 use serde::{Deserialize, Serialize};
-
+use index::{ProteinInfo, SearchResult};
 use crate::{
-    controllers::{generate_handlers, mpa::default_equate_il, mpa::default_include_fa, mpa::default_tryptic},
+    controllers::{generate_handlers, mpa::default_equate_il, mpa::default_tryptic},
     helpers::fa_helper::{calculate_fa, FunctionalAggregation},
     AppState
 };
+use crate::helpers::lca_helper::calculate_lca;
+use crate::helpers::filters::protein_filter::ProteinFilter;
+use crate::helpers::filters::proteome_filter::ProteomeFilter;
+use crate::helpers::filters::taxa_filter::TaxaFilter;
+use crate::helpers::filters::UniprotFilter;
+use crate::helpers::lineage_helper::{get_lineage_array, LineageVersion};
 use crate::helpers::sanitize_peptides;
 
 #[derive(Deserialize)]
 pub struct Parameters {
     #[serde(default)]
     peptides: Vec<String>,
+    #[serde(flatten)]
+    filter: Filter,
     #[serde(default = "default_equate_il")]
     equate_il: bool,
-    #[serde(default = "default_include_fa")]
-    include_fa: bool,
     #[serde(default = "default_tryptic")]
     tryptic: bool
 }
 
+#[derive(Deserialize)]
+pub enum Filter {
+    #[serde(rename = "taxa")]
+    Taxa(HashSet<u32>),
+    #[serde(rename = "proteomes")]
+    Proteomes(HashSet<String>),
+    #[serde(rename = "proteins")]
+    Proteins(HashSet<String>)
+}
+
 #[derive(Serialize)]
 pub struct FilteredDataItem {
     sequence: String,
-    taxa: Vec<u32>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    fa: Option<FunctionalAggregation>
+    lca: Option<u32>,
+    lineage: Vec<Option<i32>>,
+    fa: FunctionalAggregation
 }
 
 #[derive(Serialize)]
@@ -35,7 +52,7 @@ pub struct FilteredData {
 
 async fn handler(
     State(AppState { index, datastore, .. }): State<AppState>,
-    Parameters { mut peptides, equate_il, include_fa, tryptic }: Parameters
+    Parameters { mut peptides, equate_il, tryptic, filter }: Parameters
 ) -> Result {
     if peptides.is_empty() {
         return Ok(FilteredData { peptides: Vec::new() });
@@ -45,30 +62,49 @@ async fn handler(
     peptides.dedup();
     let peptides = sanitize_peptides(peptides);
 
-    let result = index.analyse(&peptides, equate_il, Some(10_000), Some(tryptic));
+    let result = index.analyse(&peptides, equate_il, tryptic, Some(10_000));
 
     let taxon_store = datastore.taxon_store();
+    let lineage_store = datastore.lineage_store();
+
+    let filter_proteins: Box<dyn UniprotFilter> = match filter {
+        Filter::Taxa(taxa) => {
+            Box::new(TaxaFilter::new(taxa, lineage_store))
+        },
+        Filter::Proteomes(proteomes) => {
+            Box::new(ProteomeFilter::new(proteomes).await.unwrap())
+        },
+        Filter::Proteins(proteins) => {
+            Box::new(ProteinFilter::new(proteins))
+        }
+    };
 
     Ok(FilteredData {
         peptides: result
            .into_iter()
-            .filter_map(|item| {
-                let item_taxa: Vec<u32> = item.proteins.iter().map(|protein| protein.taxon).filter(|&taxon_id| taxon_store.is_valid(taxon_id)).collect();
+            .filter_map(|SearchResult { proteins, sequence, .. }| {
+                let filtered_proteins: Vec<ProteinInfo> = proteins
+                    .into_iter()
+                    .filter(|protein| filter_proteins.filter(protein))
+                    .collect();
 
-                if item_taxa.is_empty() {
+                if filtered_proteins.is_empty() {
                     return None;
                 }
 
-                let fa = if include_fa {
-                    Some(calculate_fa(&item.proteins))
-                } else {
-                    None
-                };
+                let lca = calculate_lca(
+                    filtered_proteins.iter().map(|protein| protein.taxon).collect(),
+                    LineageVersion::V2,
+                    taxon_store,
+                    lineage_store
+                );
+                let lineage = get_lineage_array(lca as u32, LineageVersion::V2, lineage_store);
 
                 Some(FilteredDataItem {
-                    sequence: item.sequence,
-                    taxa: item_taxa,
-                    fa
+                    sequence,
+                    lca: Some(lca as u32),
+                    lineage,
+                    fa: calculate_fa(&filtered_proteins)
                 })
            })
            .collect()
diff --git a/api/src/controllers/private_api/proteins.rs b/api/src/controllers/private_api/proteins.rs
index 2a8ffd6..7790967 100644
--- a/api/src/controllers/private_api/proteins.rs
+++ b/api/src/controllers/private_api/proteins.rs
@@ -52,7 +52,7 @@ async fn handler(
 ) -> Result {
     let connection = database.get_conn().await?;
 
-    let result = index.analyse(&vec![peptide], equate_il, None, None);
+    let result = index.analyse(&vec![peptide], equate_il, false, None);
 
     if result.is_empty() {
         return Ok(ProteinInformation::default());
diff --git a/api/src/helpers/filters/mod.rs b/api/src/helpers/filters/mod.rs
new file mode 100644
index 0000000..3fc710d
--- /dev/null
+++ b/api/src/helpers/filters/mod.rs
@@ -0,0 +1,9 @@
+use index::ProteinInfo;
+
+pub mod taxa_filter;
+pub mod proteome_filter;
+pub mod protein_filter;
+
+pub trait UniprotFilter {
+    fn filter(&self, protein: &ProteinInfo) -> bool;
+}
diff --git a/api/src/helpers/filters/protein_filter.rs b/api/src/helpers/filters/protein_filter.rs
new file mode 100644
index 0000000..d1f968e
--- /dev/null
+++ b/api/src/helpers/filters/protein_filter.rs
@@ -0,0 +1,19 @@
+use std::collections::HashSet;
+use index::ProteinInfo;
+use crate::helpers::filters::UniprotFilter;
+
+pub struct ProteinFilter {
+    pub proteins: HashSet<String>
+}
+
+impl UniprotFilter for ProteinFilter {
+    fn filter(&self, protein: &ProteinInfo) -> bool {
+        self.proteins.contains(&protein.uniprot_accession)
+    }
+}
+
+impl ProteinFilter {
+    pub fn new(proteins: HashSet<String>) -> Self {
+        ProteinFilter { proteins }
+    }
+}
diff --git a/api/src/helpers/filters/proteome_filter.rs b/api/src/helpers/filters/proteome_filter.rs
new file mode 100644
index 0000000..e0c8a96
--- /dev/null
+++ b/api/src/helpers/filters/proteome_filter.rs
@@ -0,0 +1,31 @@
+use std::collections::HashSet;
+use index::ProteinInfo;
+use crate::helpers::filters::UniprotFilter;
+
+pub struct ProteomeFilter {
+    pub proteins: HashSet<String>
+}
+
+impl UniprotFilter for ProteomeFilter {
+    fn filter(&self, protein: &ProteinInfo) -> bool {
+        self.proteins.contains(&protein.uniprot_accession)
+    }
+}
+
+impl ProteomeFilter {
+    pub async fn new(proteomes: HashSet<String>) -> reqwest::Result<Self> {
+        let mut proteins = HashSet::new();
+
+        for proteome in proteomes {
+            proteins.extend(fetch_proteome(proteome).await?);
+        }
+
+        Ok(ProteomeFilter { proteins })
+    }
+}
+
+async fn fetch_proteome(proteome: String) -> reqwest::Result<HashSet<String>> {
+    let url = format!("https://rest.uniprot.org/uniprotkb/stream?fields=accession&format=list&query=(proteome:{})", proteome);
+    let proteins_string = reqwest::get(url).await?.text().await?;
+    Ok(proteins_string.lines().map(|line| line.to_string()).collect())
+}
diff --git a/api/src/helpers/filters/taxa_filter.rs b/api/src/helpers/filters/taxa_filter.rs
new file mode 100644
index 0000000..60878e6
--- /dev/null
+++ b/api/src/helpers/filters/taxa_filter.rs
@@ -0,0 +1,25 @@
+use std::collections::HashSet;
+use datastore::LineageStore;
+use index::ProteinInfo;
+use crate::helpers::filters::UniprotFilter;
+use crate::helpers::lineage_helper::{get_lineage_array, LineageVersion};
+
+pub struct TaxaFilter<'a> {
+    pub taxa: HashSet<u32>,
+    lineage_store: &'a LineageStore
+}
+
+impl UniprotFilter for TaxaFilter<'_> {
+    fn filter(&self, protein: &ProteinInfo) -> bool {
+        get_lineage_array(protein.taxon, LineageVersion::V2, self.lineage_store)
+            .iter()
+            .flatten()
+            .any(|ancestor| self.taxa.contains(&(ancestor.abs() as u32)))
+    }
+}
+
+impl<'a> TaxaFilter<'a> {
+    pub fn new(taxa: HashSet<u32>, lineage_store: &'a LineageStore) -> Self {
+        TaxaFilter { taxa, lineage_store }
+    }
+}
diff --git a/api/src/helpers/mod.rs b/api/src/helpers/mod.rs
index 8ec0ce2..b9ff9ad 100644
--- a/api/src/helpers/mod.rs
+++ b/api/src/helpers/mod.rs
@@ -5,6 +5,7 @@ pub mod interpro_helper;
 pub mod lca_helper;
 pub mod lineage_helper;
 pub mod tree_helper;
+pub mod filters;
 
 fn is_zero(num: &u32) -> bool {
     *num == 0
diff --git a/database/Cargo.toml b/database/Cargo.toml
index 2b03cf6..998478a 100644
--- a/database/Cargo.toml
+++ b/database/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-deadpool-diesel = { version = "0.4.1", features = ["mysql"] }
+deadpool-diesel = { version = "0.6.1", features = ["mysql"] }
 diesel = { version = "2", features = ["mysql"] }
 itertools = "0.13.0"
 thiserror = "1.0"
diff --git a/datastore/src/lib.rs b/datastore/src/lib.rs
index 7cee314..a5d4fde 100644
--- a/datastore/src/lib.rs
+++ b/datastore/src/lib.rs
@@ -35,7 +35,7 @@ impl DataStore {
         taxon_file: &str
     ) -> Result {
         let version = std::fs::read_to_string(version_file).map_err(
-            |err| DataStoreError::FileNotFound(version_file.to_string())
+            |_| DataStoreError::FileNotFound(version_file.to_string())
         )?;
         Ok(Self {
             version: version.trim_end().to_string(),
diff --git a/datastore/src/taxon_store.rs b/datastore/src/taxon_store.rs
index 3f9de73..2878aa7 100644
--- a/datastore/src/taxon_store.rs
+++ b/datastore/src/taxon_store.rs
@@ -51,7 +51,6 @@ impl fmt::Display for LineageRank {
     }
 }
 
-
 pub struct TaxonStore {
     pub mapper: HashMap
 }
diff --git a/index/src/lib.rs b/index/src/lib.rs
index b8e9f0a..62d6ca0 100644
--- a/index/src/lib.rs
+++ b/index/src/lib.rs
@@ -7,9 +7,10 @@ pub use errors::IndexError;
 use errors::LoadIndexError;
 use sa_compression::load_compressed_suffix_array;
 pub use sa_index::peptide_search::ProteinInfo;
+pub use sa_index::peptide_search::SearchResult;
 use sa_index::{
     binary::load_suffix_array,
-    peptide_search::{search_all_peptides, SearchResult},
+    peptide_search::{search_all_peptides},
     sa_searcher::SparseSearcher,
     SuffixArray
 };
@@ -36,8 +37,8 @@ impl Index {
 
         Ok(Self { searcher })
     }
 
-    pub fn analyse(&self, peptides: &Vec<String>, equate_il: bool, cutoff: Option<usize>, tryptic: Option<bool>) -> Vec<SearchResult> {
-        search_all_peptides(&self.searcher, peptides, cutoff.unwrap_or(10_000), equate_il, tryptic.unwrap_or(false))
+    pub fn analyse(&self, peptides: &Vec<String>, equate_il: bool, tryptic: bool, cutoff: Option<usize>) -> Vec<SearchResult> {
+        search_all_peptides(&self.searcher, peptides, cutoff.unwrap_or(10_000), equate_il, tryptic)
     }
 }
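
Usage sketch (not part of the patch above): the `#[serde(flatten)]` field added to `Parameters` in pept2filtered.rs means the filter variant is selected by whichever key ("taxa", "proteomes" or "proteins") appears in the request body. The snippet below illustrates that deserialization behaviour with simplified stand-in types; it assumes serde (with the derive feature) and serde_json are available, and the peptide and taxon values are made up.

// Minimal sketch of how the flattened `Filter` enum is expected to deserialize.
// `Parameters` and `Filter` below are simplified stand-ins for the real types.
use std::collections::HashSet;

use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct Parameters {
    #[serde(default)]
    peptides: Vec<String>,
    #[serde(flatten)]
    filter: Filter
}

#[derive(Deserialize, Debug)]
enum Filter {
    #[serde(rename = "taxa")]
    Taxa(HashSet<u32>),
    #[serde(rename = "proteomes")]
    Proteomes(HashSet<String>),
    #[serde(rename = "proteins")]
    Proteins(HashSet<String>)
}

fn main() {
    // Exactly one of "taxa", "proteomes" or "proteins" should be present;
    // serde picks the matching variant from the keys left over after the
    // named struct fields are consumed.
    let body = r#"{ "peptides": ["AALTER"], "taxa": [9606, 10090] }"#;
    let params: Parameters = serde_json::from_str(body).unwrap();
    println!("{:?}", params.filter); // prints Taxa({...}) with both taxon ids (set order may vary)
}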