Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add filtering of UniProtKB to API #56

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ http = "1.1.0"
tower-layer = "0.3.2"
tower-service = "0.3.2"
itertools = "0.13.0"
reqwest = { version = "0.12.8", features = [ "json" ] }
4 changes: 4 additions & 0 deletions api/src/controllers/api/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,7 @@ pub fn default_descendants_ranks() -> Vec<String> {
/// Default-value helper that always yields `false`.
///
/// NOTE(review): by its name this is presumably the serde default for a `link`
/// request parameter; confirm against the structs that reference it.
pub fn default_link() -> bool {
    false
}

/// Serde default for the `compact` request parameter: compact output stays
/// disabled unless the caller explicitly asks for it.
pub fn default_compact() -> bool {
    false
}
2 changes: 1 addition & 1 deletion api/src/controllers/api/pept2ec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ async fn handler(
}

let unique_peptides: Vec<String> = peptide_counts.keys().cloned().collect();
let result = index.analyse(&unique_peptides, equate_il, None, None);
let result = index.analyse(&unique_peptides, equate_il, false, None);

let ec_store = datastore.ec_store();

Expand Down
2 changes: 1 addition & 1 deletion api/src/controllers/api/pept2funct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ async fn handler(
Parameters { input, equate_il, extra, domains }: Parameters
) -> Result<Vec<FunctInformation>, ()> {
let input = sanitize_peptides(input);
let result = index.analyse(&input, equate_il, None, None);
let result = index.analyse(&input, equate_il, false, None);

let ec_store = datastore.ec_store();
let go_store = datastore.go_store();
Expand Down
2 changes: 1 addition & 1 deletion api/src/controllers/api/pept2go.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ async fn handler(
Parameters { input, equate_il, extra, domains }: Parameters
) -> Result<Vec<GoInformation>, ()> {
let input = sanitize_peptides(input);
let result = index.analyse(&input, equate_il, None, None);
let result = index.analyse(&input, equate_il, false, None);

let go_store = datastore.go_store();

Expand Down
2 changes: 1 addition & 1 deletion api/src/controllers/api/pept2interpro.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ async fn handler(
Parameters { input, equate_il, extra, domains }: Parameters
) -> Result<Vec<InterproInformation>, ()> {
let input = sanitize_peptides(input);
let result = index.analyse(&input, equate_il, None, None);
let result = index.analyse(&input, equate_il, false, None);

let interpro_store = datastore.interpro_store();

Expand Down
2 changes: 1 addition & 1 deletion api/src/controllers/api/pept2lca.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ async fn handler(
version: LineageVersion
) -> Result<Vec<LcaInformation>, ()> {
let input = sanitize_peptides(input);
let result = index.analyse(&input, equate_il, None, None);
let result = index.analyse(&input, equate_il, false, None);

let taxon_store = datastore.taxon_store();
let lineage_store = datastore.lineage_store();
Expand Down
3 changes: 1 addition & 2 deletions api/src/controllers/api/pept2prot.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::collections::HashSet;
use axum::{extract::State, Json};
use itertools::Itertools;
use database::get_accessions_map;
use serde::{Deserialize, Serialize};

Expand Down Expand Up @@ -57,7 +56,7 @@ async fn handler(

let connection = database.get_conn().await?;

let result = index.analyse(&input, equate_il, None, Some(tryptic));
let result = index.analyse(&input, equate_il, tryptic, None);

let accession_numbers: HashSet<String> = result
.iter()
Expand Down
50 changes: 43 additions & 7 deletions api/src/controllers/api/pept2taxa.rs
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We will also have to update the API documentation on the website: the new `compact` parameter is a configuration option that other people might want to use as well.

Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize};

use crate::{
controllers::{
api::{default_equate_il, default_extra, default_names},
api::{default_equate_il, default_extra, default_names, default_compact, default_tryptic},
generate_handlers
},
helpers::lineage_helper::{
Expand All @@ -25,18 +25,35 @@ pub struct Parameters {
#[serde(default = "default_extra")]
extra: bool,
#[serde(default = "default_names")]
names: bool
names: bool,
#[serde(default = "default_tryptic")]
tryptic: bool,
#[serde(default = "default_compact")]
compact: bool
}

#[derive(Serialize)]
pub struct TaxaInformation {
#[serde(untagged)]
pub enum TaxaInformation {
Dense (DenseTaxaInformation),
Compact (CompactTaxaInformation)
}

/// One serialized row of the non-compact `pept2taxa` output: a peptide together
/// with one taxon it was matched to.
#[derive(Serialize)]
pub struct DenseTaxaInformation {
// The peptide sequence this row belongs to.
peptide: String,
// Flattened: the fields of `Taxon` appear directly in the row instead of
// under a nested `taxon` key.
#[serde(flatten)]
taxon: Taxon,
// Flattened as well, and left out of the output entirely when it is `None`.
#[serde(flatten, skip_serializing_if = "Option::is_none")]
lineage: Option<Lineage>
}

/// One serialized row of the compact `pept2taxa` output: a peptide and the bare
/// taxon ids it was matched to, without names, ranks or lineages.
#[derive(Serialize)]
pub struct CompactTaxaInformation {
// The peptide sequence this row belongs to.
peptide: String,
// Taxon ids of the matched proteins that the taxon store reports as valid.
// The handler collects one id per matching protein without deduplicating,
// so the same id can occur more than once.
taxa: Vec<u32>
}

#[derive(Serialize)]
pub struct Taxon {
taxon_id: u32,
Expand All @@ -46,15 +63,34 @@ pub struct Taxon {

async fn handler(
State(AppState { index, datastore, .. }): State<AppState>,
Parameters { input, equate_il, extra, names }: Parameters,
Parameters { input, equate_il, extra, names, tryptic, compact }: Parameters,
version: LineageVersion
) -> Result<Vec<TaxaInformation>, ()> {
let input = sanitize_peptides(input);
let result = index.analyse(&input, equate_il, None, None);
let result = index.analyse(&input, equate_il, tryptic, None);

let taxon_store = datastore.taxon_store();
let lineage_store = datastore.lineage_store();

if compact {
return Ok(result
.into_iter()
.filter_map(|item| {
let item_taxa: Vec<u32> = item.proteins.iter().map(|protein| protein.taxon).filter(|&taxon_id| taxon_store.is_valid(taxon_id)).collect();

if item_taxa.is_empty() {
return None;
}

Some(TaxaInformation::Compact(CompactTaxaInformation {
peptide: item.sequence,
taxa: item_taxa,
}))
})
.collect()
)
}

Ok(result
.into_iter()
.flat_map(|item| {
Expand All @@ -67,15 +103,15 @@ async fn handler(
(false, _) => None
};

Some(TaxaInformation {
Some(TaxaInformation::Dense(DenseTaxaInformation {
peptide: item.sequence.clone(),
taxon: Taxon {
taxon_id: taxon,
taxon_name: name.to_string(),
taxon_rank: rank.clone().into()
},
lineage
})
}))
}
)
})
Expand Down
2 changes: 1 addition & 1 deletion api/src/controllers/api/peptinfo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ async fn handler(
version: LineageVersion
) -> Result<Vec<PeptInformation>, ()> {
let input = sanitize_peptides(input);
let result = index.analyse(&input, equate_il, None, None);
let result = index.analyse(&input, equate_il, false, None);

let ec_store = datastore.ec_store();
let go_store = datastore.go_store();
Expand Down
2 changes: 1 addition & 1 deletion api/src/controllers/mpa/pept2data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ async fn handler(
peptides.dedup();

let peptides = sanitize_peptides(peptides);
let result = index.analyse(&peptides, equate_il, Some(10_000), Some(tryptic));
let result = index.analyse(&peptides, equate_il, tryptic, Some(10_000));

let taxon_store = datastore.taxon_store();
let lineage_store = datastore.lineage_store();
Expand Down
76 changes: 56 additions & 20 deletions api/src/controllers/mpa/pept2filtered.rs
Original file line number Diff line number Diff line change
@@ -1,31 +1,48 @@
use std::collections::HashSet;
use axum::{extract::State, Json};
use serde::{Deserialize, Serialize};

use index::{ProteinInfo, SearchResult};
use crate::{
controllers::{generate_handlers, mpa::default_equate_il, mpa::default_include_fa, mpa::default_tryptic},
controllers::{generate_handlers, mpa::default_equate_il, mpa::default_tryptic},
helpers::fa_helper::{calculate_fa, FunctionalAggregation},
AppState
};
use crate::helpers::lca_helper::calculate_lca;
use crate::helpers::filters::protein_filter::ProteinFilter;
use crate::helpers::filters::proteome_filter::ProteomeFilter;
use crate::helpers::filters::taxa_filter::TaxaFilter;
use crate::helpers::filters::UniprotFilter;
use crate::helpers::lineage_helper::{get_lineage_array, LineageVersion};
use crate::helpers::sanitize_peptides;

#[derive(Deserialize)]
pub struct Parameters {
#[serde(default)]
peptides: Vec<String>,
#[serde(flatten)]
filter: Filter,
#[serde(default = "default_equate_il")]
equate_il: bool,
#[serde(default = "default_include_fa")]
include_fa: bool,
#[serde(default = "default_tryptic")]
tryptic: bool
}

/// The restriction that is applied to the proteins matched by each peptide.
///
/// Every variant is deserialized from the lowercase form of its name, i.e. the
/// request keys `taxa`, `proteomes` and `proteins`.
#[derive(Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum Filter {
    /// Taxon ids; the handler hands these to `TaxaFilter`.
    Taxa(HashSet<u32>),
    /// Proteome identifiers; the handler hands these to `ProteomeFilter`.
    Proteomes(HashSet<String>),
    /// UniProt accessions; the handler hands these to `ProteinFilter`.
    Proteins(HashSet<String>)
}

#[derive(Serialize)]
pub struct FilteredDataItem {
sequence: String,
taxa: Vec<u32>,
#[serde(skip_serializing_if = "Option::is_none")]
fa: Option<FunctionalAggregation>
lca: Option<u32>,
lineage: Vec<Option<i32>>,
fa: FunctionalAggregation
}

#[derive(Serialize)]
Expand All @@ -35,7 +52,7 @@ pub struct FilteredData {

async fn handler(
State(AppState { index, datastore, .. }): State<AppState>,
Parameters { mut peptides, equate_il, include_fa, tryptic }: Parameters
Parameters { mut peptides, equate_il, tryptic, filter }: Parameters
) -> Result<FilteredData, ()> {
if peptides.is_empty() {
return Ok(FilteredData { peptides: Vec::new() });
Expand All @@ -45,30 +62,49 @@ async fn handler(
peptides.dedup();

let peptides = sanitize_peptides(peptides);
let result = index.analyse(&peptides, equate_il, Some(10_000), Some(tryptic));
let result = index.analyse(&peptides, equate_il, tryptic, Some(10_000));

let taxon_store = datastore.taxon_store();
let lineage_store = datastore.lineage_store();

let filter_proteins: Box<dyn UniprotFilter> = match filter {
Filter::Taxa(taxa) => {
Box::new(TaxaFilter::new(taxa, lineage_store))
},
Filter::Proteomes(proteomes) => {
Box::new(ProteomeFilter::new(proteomes).await.unwrap())
},
Filter::Proteins(proteins) => {
Box::new(ProteinFilter::new(proteins))
}
};
Comment on lines +70 to +80
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should consider adding some kind of extra error handling. What happens if a user passes a reference proteome, UniProt accession, or taxon ID that doesn't exist? Ideally, we should return an HTTP 422 status code in this case: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/422


Ok(FilteredData {
peptides: result
.into_iter()
.filter_map(|item| {
let item_taxa: Vec<u32> = item.proteins.iter().map(|protein| protein.taxon).filter(|&taxon_id| taxon_store.is_valid(taxon_id)).collect();
.filter_map(|SearchResult { proteins, sequence, .. }| {
let filtered_proteins: Vec<ProteinInfo> = proteins
.into_iter()
.filter(|protein| filter_proteins.filter(protein))
.collect();

if item_taxa.is_empty() {
if filtered_proteins.is_empty() {
return None;
}

let fa = if include_fa {
Some(calculate_fa(&item.proteins))
} else {
None
};
let lca = calculate_lca(
filtered_proteins.iter().map(|protein| protein.taxon).collect(),
LineageVersion::V2,
taxon_store,
lineage_store
);
let lineage = get_lineage_array(lca as u32, LineageVersion::V2, lineage_store);

Some(FilteredDataItem {
sequence: item.sequence,
taxa: item_taxa,
fa
sequence,
lca: Some(lca as u32),
lineage,
fa: calculate_fa(&filtered_proteins)
})
})
.collect()
Expand Down
2 changes: 1 addition & 1 deletion api/src/controllers/private_api/proteins.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ async fn handler(
) -> Result<ProteinInformation, ApiError> {
let connection = database.get_conn().await?;

let result = index.analyse(&vec![peptide], equate_il, None, None);
let result = index.analyse(&vec![peptide], equate_il, false, None);

if result.is_empty() {
return Ok(ProteinInformation::default());
Expand Down
9 changes: 9 additions & 0 deletions api/src/helpers/filters/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
use index::ProteinInfo;

pub mod taxa_filter;
pub mod proteome_filter;
pub mod protein_filter;

/// A yes/no predicate over a single protein entry.
///
/// The `pept2filtered` handler boxes one implementation as `dyn UniprotFilter`
/// and keeps only the proteins for which `filter` returns `true`.
pub trait UniprotFilter {
/// Returns `true` when `protein` passes this filter and should be kept.
fn filter(&self, protein: &ProteinInfo) -> bool;
}
19 changes: 19 additions & 0 deletions api/src/helpers/filters/protein_filter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
use std::collections::HashSet;
use index::ProteinInfo;
use crate::helpers::filters::UniprotFilter;

/// Filter that lets through only proteins whose UniProt accession is part of a
/// fixed, caller-supplied set.
pub struct ProteinFilter {
    /// Accessions of the proteins that pass the filter.
    pub proteins: HashSet<String>
}

impl ProteinFilter {
    /// Creates a filter that accepts exactly the accessions in `proteins`.
    pub fn new(proteins: HashSet<String>) -> Self {
        Self { proteins }
    }
}

impl UniprotFilter for ProteinFilter {
    /// Returns `true` when the accession of `protein` is one of the accepted
    /// accessions.
    fn filter(&self, protein: &ProteinInfo) -> bool {
        let accession = &protein.uniprot_accession;
        self.proteins.contains(accession)
    }
}
31 changes: 31 additions & 0 deletions api/src/helpers/filters/proteome_filter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
use std::collections::HashSet;
use index::ProteinInfo;
use crate::helpers::filters::UniprotFilter;

/// Filter that lets through only proteins that belong to one of a set of
/// proteomes.
///
/// The proteomes are resolved to accessions once, at construction time, so
/// filtering itself is a plain set lookup.
pub struct ProteomeFilter {
    /// Union of the accessions returned by `fetch_proteome` for every
    /// requested proteome.
    pub proteins: HashSet<String>
}

impl ProteomeFilter {
    /// Resolves every proteome in `proteomes` to its accessions and builds the
    /// filter from their union.
    ///
    /// The proteomes are fetched one after another.
    ///
    /// # Errors
    /// Returns the first `reqwest::Error` produced by `fetch_proteome`; no
    /// filter is built in that case.
    pub async fn new(proteomes: HashSet<String>) -> reqwest::Result<Self> {
        let mut accessions = HashSet::new();

        for proteome_id in proteomes {
            let members = fetch_proteome(proteome_id).await?;
            accessions.extend(members);
        }

        Ok(Self { proteins: accessions })
    }
}

impl UniprotFilter for ProteomeFilter {
    /// Returns `true` when the accession of `protein` was listed for one of
    /// the proteomes this filter was built from.
    fn filter(&self, protein: &ProteinInfo) -> bool {
        let accession = &protein.uniprot_accession;
        self.proteins.contains(accession)
    }
}

/// Downloads the UniProtKB accessions that belong to a single proteome.
///
/// Calls the UniProt REST `uniprotkb/stream` endpoint with `fields=accession`
/// and `format=list`, and turns the response body (read line by line) into a
/// set of accessions.
///
/// # Errors
/// Returns a `reqwest::Error` when the request cannot be sent, when UniProt
/// answers with a non-success HTTP status, or when the body cannot be read.
async fn fetch_proteome(proteome: String) -> reqwest::Result<HashSet<String>> {
    let query = format!("(proteome:{})", proteome);

    let body = reqwest::Client::new()
        .get("https://rest.uniprot.org/uniprotkb/stream")
        // `proteome` comes straight from the API request. Passing it through
        // `query` percent-encodes it, so characters such as `&` or `#` cannot
        // add, override or truncate URL parameters.
        .query(&[
            ("fields", "accession"),
            ("format", "list"),
            ("query", query.as_str())
        ])
        .send()
        .await?
        // Without this check the body of an error response (e.g. a 400 for a
        // malformed query) would be parsed as if it were a list of accessions.
        .error_for_status()?
        .text()
        .await?;

    Ok(body
        .lines()
        .map(str::trim)
        // Blank lines would otherwise end up in the set as empty accessions.
        .filter(|line| !line.is_empty())
        .map(str::to_owned)
        .collect())
}
Loading