-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add filtering of UniProtKB to API #56
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,31 +1,48 @@ | ||
use std::collections::HashSet; | ||
use axum::{extract::State, Json}; | ||
use serde::{Deserialize, Serialize}; | ||
|
||
use index::{ProteinInfo, SearchResult}; | ||
use crate::{ | ||
controllers::{generate_handlers, mpa::default_equate_il, mpa::default_include_fa, mpa::default_tryptic}, | ||
controllers::{generate_handlers, mpa::default_equate_il, mpa::default_tryptic}, | ||
helpers::fa_helper::{calculate_fa, FunctionalAggregation}, | ||
AppState | ||
}; | ||
use crate::helpers::lca_helper::calculate_lca; | ||
use crate::helpers::filters::protein_filter::ProteinFilter; | ||
use crate::helpers::filters::proteome_filter::ProteomeFilter; | ||
use crate::helpers::filters::taxa_filter::TaxaFilter; | ||
use crate::helpers::filters::UniprotFilter; | ||
use crate::helpers::lineage_helper::{get_lineage_array, LineageVersion}; | ||
use crate::helpers::sanitize_peptides; | ||
|
||
#[derive(Deserialize)] | ||
pub struct Parameters { | ||
#[serde(default)] | ||
peptides: Vec<String>, | ||
#[serde(flatten)] | ||
filter: Filter, | ||
#[serde(default = "default_equate_il")] | ||
equate_il: bool, | ||
#[serde(default = "default_include_fa")] | ||
include_fa: bool, | ||
#[serde(default = "default_tryptic")] | ||
tryptic: bool | ||
} | ||
|
||
/// Selects which UniProtKB entries are kept when filtering search results. | ||
/// | ||
/// Exactly one variant is deserialized; the serde names of the variants are | ||
/// `taxa`, `proteomes` and `proteins`. | ||
#[derive(Deserialize)] | ||
pub enum Filter { | ||
/// Filter by a set of numeric taxon IDs. | ||
#[serde(rename = "taxa")] | ||
Taxa(HashSet<u32>), | ||
/// Filter by a set of proteome identifiers (strings). | ||
#[serde(rename = "proteomes")] | ||
Proteomes(HashSet<String>), | ||
/// Filter by a set of protein identifiers (strings). | ||
#[serde(rename = "proteins")] | ||
Proteins(HashSet<String>) | ||
} | ||
|
||
#[derive(Serialize)] | ||
pub struct FilteredDataItem { | ||
sequence: String, | ||
taxa: Vec<u32>, | ||
#[serde(skip_serializing_if = "Option::is_none")] | ||
fa: Option<FunctionalAggregation> | ||
lca: Option<u32>, | ||
lineage: Vec<Option<i32>>, | ||
fa: FunctionalAggregation | ||
} | ||
|
||
#[derive(Serialize)] | ||
|
@@ -35,7 +52,7 @@ pub struct FilteredData { | |
|
||
async fn handler( | ||
State(AppState { index, datastore, .. }): State<AppState>, | ||
Parameters { mut peptides, equate_il, include_fa, tryptic }: Parameters | ||
Parameters { mut peptides, equate_il, tryptic, filter }: Parameters | ||
) -> Result<FilteredData, ()> { | ||
if peptides.is_empty() { | ||
return Ok(FilteredData { peptides: Vec::new() }); | ||
|
@@ -45,30 +62,49 @@ async fn handler( | |
peptides.dedup(); | ||
|
||
let peptides = sanitize_peptides(peptides); | ||
let result = index.analyse(&peptides, equate_il, Some(10_000), Some(tryptic)); | ||
let result = index.analyse(&peptides, equate_il, tryptic, Some(10_000)); | ||
|
||
let taxon_store = datastore.taxon_store(); | ||
let lineage_store = datastore.lineage_store(); | ||
|
||
let filter_proteins: Box<dyn UniprotFilter> = match filter { | ||
Filter::Taxa(taxa) => { | ||
Box::new(TaxaFilter::new(taxa, lineage_store)) | ||
}, | ||
Filter::Proteomes(proteomes) => { | ||
Box::new(ProteomeFilter::new(proteomes).await.unwrap()) | ||
}, | ||
Filter::Proteins(proteins) => { | ||
Box::new(ProteinFilter::new(proteins)) | ||
} | ||
}; | ||
Comment on lines
+70
to
+80
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe we should consider adding some kind of extra error handling. What happens if a user passes a reference proteome, UniProt accession or taxon ID that doesn't exist? Ideally, we should return an HTTP 422 status code in this case: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/422 |
||
|
||
Ok(FilteredData { | ||
peptides: result | ||
.into_iter() | ||
.filter_map(|item| { | ||
let item_taxa: Vec<u32> = item.proteins.iter().map(|protein| protein.taxon).filter(|&taxon_id| taxon_store.is_valid(taxon_id)).collect(); | ||
.filter_map(|SearchResult { proteins, sequence, .. }| { | ||
let filtered_proteins: Vec<ProteinInfo> = proteins | ||
.into_iter() | ||
.filter(|protein| filter_proteins.filter(protein)) | ||
.collect(); | ||
|
||
if item_taxa.is_empty() { | ||
if filtered_proteins.is_empty() { | ||
return None; | ||
} | ||
|
||
let fa = if include_fa { | ||
Some(calculate_fa(&item.proteins)) | ||
} else { | ||
None | ||
}; | ||
let lca = calculate_lca( | ||
filtered_proteins.iter().map(|protein| protein.taxon).collect(), | ||
LineageVersion::V2, | ||
taxon_store, | ||
lineage_store | ||
); | ||
let lineage = get_lineage_array(lca as u32, LineageVersion::V2, lineage_store); | ||
|
||
Some(FilteredDataItem { | ||
sequence: item.sequence, | ||
taxa: item_taxa, | ||
fa | ||
sequence, | ||
lca: Some(lca as u32), | ||
lineage, | ||
fa: calculate_fa(&filtered_proteins) | ||
}) | ||
}) | ||
.collect() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
use index::ProteinInfo; | ||
|
||
pub mod taxa_filter; | ||
pub mod proteome_filter; | ||
pub mod protein_filter; | ||
|
||
/// Predicate over a single [`ProteinInfo`]. | ||
/// | ||
/// The request handler in this change passes `filter` to `Iterator::filter`, | ||
/// so returning `true` keeps the protein and `false` drops it. | ||
pub trait UniprotFilter { | ||
/// Returns whether `protein` passes this filter. | ||
fn filter(&self, protein: &ProteinInfo) -> bool; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
use std::collections::HashSet; | ||
use index::ProteinInfo; | ||
use crate::helpers::filters::UniprotFilter; | ||
|
||
/// Filter that accepts proteins whose UniProt accession is in a given set. | ||
pub struct ProteinFilter { | ||
/// Accessions that are allowed through the filter. | ||
pub proteins: HashSet<String> | ||
} | ||
|
||
impl UniprotFilter for ProteinFilter { | ||
/// Returns `true` when `protein.uniprot_accession` is contained in `self.proteins`. | ||
fn filter(&self, protein: &ProteinInfo) -> bool { | ||
self.proteins.contains(&protein.uniprot_accession) | ||
} | ||
} | ||
|
||
impl ProteinFilter { | ||
/// Creates a filter from the given set of accessions; the set is stored as-is. | ||
pub fn new(proteins: HashSet<String>) -> Self { | ||
ProteinFilter { proteins } | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
use std::collections::HashSet; | ||
use index::ProteinInfo; | ||
use crate::helpers::filters::UniprotFilter; | ||
|
||
/// Filter that accepts proteins belonging to one of the requested proteomes. | ||
/// | ||
/// The proteomes are resolved once, in `ProteomeFilter::new`, into a flat set | ||
/// of accessions; filtering itself is a set lookup. | ||
pub struct ProteomeFilter { | ||
/// Accessions collected for all requested proteomes. | ||
pub proteins: HashSet<String> | ||
} | ||
|
||
impl UniprotFilter for ProteomeFilter { | ||
/// Returns `true` when `protein.uniprot_accession` is contained in `self.proteins`. | ||
fn filter(&self, protein: &ProteinInfo) -> bool { | ||
self.proteins.contains(&protein.uniprot_accession) | ||
} | ||
} | ||
|
||
impl ProteomeFilter { | ||
/// Builds the filter by fetching the accession list of every proteome in | ||
/// `proteomes` and merging the lists into one set. | ||
/// | ||
/// # Errors | ||
/// | ||
/// Returns the first `reqwest` error produced by `fetch_proteome`; proteomes | ||
/// after the failing one are not requested. | ||
pub async fn new(proteomes: HashSet<String>) -> reqwest::Result<Self> { | ||
let mut proteins = HashSet::new(); | ||
|
||
// Each fetch is awaited before the next one starts. | ||
for proteome in proteomes { | ||
proteins.extend(fetch_proteome(proteome).await?); | ||
} | ||
|
||
Ok(ProteomeFilter { proteins }) | ||
} | ||
} | ||
|
||
/// Requests the accession list for `proteome` from the UniProt REST `stream` | ||
/// endpoint (`fields=accession`, `format=list`) and returns every line of the | ||
/// response body as one set entry. | ||
/// | ||
/// # Errors | ||
/// | ||
/// Propagates the `reqwest` error from sending the request or reading the body. | ||
/// | ||
/// NOTE(review): `proteome` is interpolated into the query string without | ||
/// escaping, and the response status is not inspected before the body is read. | ||
/// Presumably an error response (e.g. for an unknown proteome) would have its | ||
/// body lines stored as if they were accessions; confirm, and consider | ||
/// `Response::error_for_status`. | ||
async fn fetch_proteome(proteome: String) -> reqwest::Result<HashSet<String>> { | ||
let url = format!("https://rest.uniprot.org/uniprotkb/stream?fields=accession&format=list&query=(proteome:{})", proteome); | ||
let proteins_string = reqwest::get(url).await?.text().await?; | ||
Ok(proteins_string.lines().map(|line| line.to_string()).collect()) | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We are also going to have to update the documentation of the API on the website. I think this is a configuration option that other people might also want to use. (I mean the addition of the `compact` parameter.)