From 5c6e2c901ba8ab02a34a490ba8c76d6c6ea96ba4 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Wed, 28 Aug 2024 11:34:05 +0200 Subject: [PATCH 1/9] use the k-mer optimizations --- index/Cargo.toml | 6 +++--- index/src/lib.rs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/index/Cargo.toml b/index/Cargo.toml index dc8bd9b..6401e85 100644 --- a/index/Cargo.toml +++ b/index/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -sa-compression = { git = "https://github.com/unipept/unipept-index.git" } -sa-index = { git = "https://github.com/unipept/unipept-index.git" } -sa-mappings = { git = "https://github.com/unipept/unipept-index.git" } +sa-compression = { git = "https://github.com/unipept/unipept-index.git", rev = "352bcb797dcd6c369275aec178417bc14e6d2480" } +sa-index = { git = "https://github.com/unipept/unipept-index.git", rev = "352bcb797dcd6c369275aec178417bc14e6d2480" } +sa-mappings = { git = "https://github.com/unipept/unipept-index.git", rev = "352bcb797dcd6c369275aec178417bc14e6d2480" } thiserror = "1.0" diff --git a/index/src/lib.rs b/index/src/lib.rs index e304cbd..e5bb971 100644 --- a/index/src/lib.rs +++ b/index/src/lib.rs @@ -2,7 +2,7 @@ use std::{ fs::File, io::{BufReader, Read} }; - +use std::str::from_utf8; pub use errors::IndexError; use errors::LoadIndexError; use sa_compression::load_compressed_suffix_array; @@ -29,7 +29,7 @@ impl Index { let proteins = Proteins::try_from_database_file(proteins_file) .map_err(|err| LoadIndexError::LoadProteinsErrors(err.to_string()))?; - let searcher = SparseSearcher::new(suffix_array, proteins); + let searcher = SparseSearcher::new(suffix_array, proteins, 5); Ok(Self { searcher }) } From 13ca78d3345122d261c2796f19494882e666c20b Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Wed, 28 Aug 2024 13:55:53 +0200 Subject: [PATCH 2/9] Split the summarization of functional annotations --- Cargo.lock | 10 +-- api/src/controllers/api/pept2ec.rs | 3 +- api/src/controllers/api/pept2go.rs | 3 +- api/src/controllers/api/pept2interpro.rs | 3 +- api/src/helpers/fa_helper.rs | 81 ++++++++++++++++-------- 5 files changed, 67 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5eea72d..1fd71fb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -232,7 +232,7 @@ dependencies = [ [[package]] name = "bitarray" version = "0.1.0" -source = "git+https://github.com/unipept/unipept-index.git#f15e3f8b11d5563096a35cdf33e090ea27570894" +source = "git+https://github.com/unipept/unipept-index.git?rev=352bcb797dcd6c369275aec178417bc14e6d2480#352bcb797dcd6c369275aec178417bc14e6d2480" [[package]] name = "bitflags" @@ -508,7 +508,7 @@ dependencies = [ [[package]] name = "fa-compression" version = "0.1.0" -source = "git+https://github.com/unipept/unipept-index.git#f15e3f8b11d5563096a35cdf33e090ea27570894" +source = "git+https://github.com/unipept/unipept-index.git?rev=352bcb797dcd6c369275aec178417bc14e6d2480#352bcb797dcd6c369275aec178417bc14e6d2480" [[package]] name = "fnv" @@ -1042,7 +1042,7 @@ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "sa-compression" version = "0.1.0" -source = "git+https://github.com/unipept/unipept-index.git#f15e3f8b11d5563096a35cdf33e090ea27570894" +source = "git+https://github.com/unipept/unipept-index.git?rev=352bcb797dcd6c369275aec178417bc14e6d2480#352bcb797dcd6c369275aec178417bc14e6d2480" dependencies = [ "bitarray", "sa-index", @@ -1051,7 +1051,7 @@ dependencies = [ [[package]] name = "sa-index" version = "0.1.0" -source = "git+https://github.com/unipept/unipept-index.git#f15e3f8b11d5563096a35cdf33e090ea27570894" +source = "git+https://github.com/unipept/unipept-index.git?rev=352bcb797dcd6c369275aec178417bc14e6d2480#352bcb797dcd6c369275aec178417bc14e6d2480" dependencies = [ "bitarray", "clap", @@ -1064,7 +1064,7 @@ dependencies = [ [[package]] name = "sa-mappings" version = "0.1.0" -source = "git+https://github.com/unipept/unipept-index.git#f15e3f8b11d5563096a35cdf33e090ea27570894" +source = "git+https://github.com/unipept/unipept-index.git?rev=352bcb797dcd6c369275aec178417bc14e6d2480#352bcb797dcd6c369275aec178417bc14e6d2480" dependencies = [ "bytelines", "fa-compression", diff --git a/api/src/controllers/api/pept2ec.rs b/api/src/controllers/api/pept2ec.rs index 0992a09..7e3e176 100644 --- a/api/src/controllers/api/pept2ec.rs +++ b/api/src/controllers/api/pept2ec.rs @@ -13,6 +13,7 @@ use crate::{ }, AppState }; +use crate::helpers::fa_helper::calculate_ec; use crate::helpers::sanitize_peptides; #[derive(Deserialize)] @@ -52,7 +53,7 @@ async fn handler( let mut final_results = Vec::new(); for (unique_peptide, item) in unique_peptides.iter().zip(result.into_iter()) { if let Some(count) = peptide_counts.get(unique_peptide) { - let fa = calculate_fa(&item.proteins); + let fa = calculate_ec(&item.proteins); let total_protein_count = *fa.counts.get("all").unwrap_or(&0); for _ in 0..*count { diff --git a/api/src/controllers/api/pept2go.rs b/api/src/controllers/api/pept2go.rs index cb6cf37..f20eb3c 100644 --- a/api/src/controllers/api/pept2go.rs +++ b/api/src/controllers/api/pept2go.rs @@ -12,6 +12,7 @@ use crate::{ }, AppState }; +use crate::helpers::fa_helper::calculate_go; use crate::helpers::sanitize_peptides; #[derive(Deserialize)] @@ -45,7 +46,7 @@ async fn handler( Ok(result .into_iter() .map(|item| { - let fa = calculate_fa(&item.proteins); + let fa = calculate_go(&item.proteins); let total_protein_count = *fa.counts.get("all").unwrap_or(&0); let gos = go_terms_from_map(&fa.data, go_store, extra, domains); diff --git a/api/src/controllers/api/pept2interpro.rs b/api/src/controllers/api/pept2interpro.rs index fce1ec4..fe9de47 100644 --- a/api/src/controllers/api/pept2interpro.rs +++ b/api/src/controllers/api/pept2interpro.rs @@ -12,6 +12,7 @@ use crate::{ }, AppState }; +use crate::helpers::fa_helper::calculate_ipr; use crate::helpers::sanitize_peptides; #[derive(Deserialize)] @@ -45,7 +46,7 @@ async fn handler( Ok(result .into_iter() .map(|item| { - let fa = calculate_fa(&item.proteins); + let fa = calculate_ipr(&item.proteins); let total_protein_count = *fa.counts.get("all").unwrap_or(&0); let iprs = interpro_entries_from_map(&fa.data, interpro_store, extra, domains); diff --git a/api/src/helpers/fa_helper.rs b/api/src/helpers/fa_helper.rs index b5bb18d..edf4cb8 100644 --- a/api/src/helpers/fa_helper.rs +++ b/api/src/helpers/fa_helper.rs @@ -12,42 +12,73 @@ pub struct FunctionalAggregation { pub data: HashMap } -pub fn calculate_fa(proteins: &[ProteinInfo]) -> FunctionalAggregation { - // Keep track of the proteins that have any annotation - let mut proteins_with_annotations: HashSet<&str> = HashSet::new(); - - // Keep track of the proteins that have a certain annotation - let mut proteins_with_ec: HashSet<&str> = HashSet::new(); - let mut proteins_with_go: HashSet<&str> = HashSet::new(); - let mut proteins_with_ipr: HashSet<&str> = HashSet::new(); - - // Keep track of the counts of the different annotations - let mut data: HashMap = HashMap::new(); +fn count_annotations( + proteins: &[ProteinInfo], + annotation_prefix: char +) -> (HashSet, HashMap) { + let mut proteins_with_annotation: HashSet = HashSet::new(); + let mut protein_data: HashMap = HashMap::new(); for protein in proteins.iter() { for annotation in protein.functional_annotations.split(';') { match annotation.chars().next() { - Some('E') => { - proteins_with_ec.insert(&protein.uniprot_accession); - proteins_with_annotations.insert(&protein.uniprot_accession); - } - Some('G') => { - proteins_with_go.insert(&protein.uniprot_accession); - proteins_with_annotations.insert(&protein.uniprot_accession); - } - Some('I') => { - proteins_with_ipr.insert(&protein.uniprot_accession); - proteins_with_annotations.insert(&protein.uniprot_accession); + Some(c) => { + if c == annotation_prefix { + proteins_with_annotation.insert(protein.uniprot_accession.clone()); + proteins_with_annotation.insert(protein.uniprot_accession.clone()); + protein_data.entry(annotation.to_string()).and_modify(|c| *c += 1).or_insert(1); + } } _ => {} }; - - data.entry(annotation.to_string()).and_modify(|c| *c += 1).or_insert(1); } } + (proteins_with_annotation, protein_data) +} + +pub fn calculate_ec(proteins: &[ProteinInfo]) -> FunctionalAggregation { + let (proteins_with_ec, ec_protein_data) = count_annotations(proteins, 'E'); + + let mut counts: HashMap = HashMap::new(); + counts.insert("all".to_string(), proteins_with_ec.len()); + + FunctionalAggregation { counts, data: ec_protein_data } +} + +pub fn calculate_go(proteins: &[ProteinInfo]) -> FunctionalAggregation { + let (proteins_with_go, go_protein_data) = count_annotations(proteins, 'G'); + + let mut counts: HashMap = HashMap::new(); + counts.insert("all".to_string(), proteins_with_go.len()); + + FunctionalAggregation { counts, data: go_protein_data } +} + +pub fn calculate_ipr(proteins: &[ProteinInfo]) -> FunctionalAggregation { + let (proteins_with_ipr, ipr_protein_data) = count_annotations(proteins, 'I'); + + let mut counts: HashMap = HashMap::new(); + counts.insert("all".to_string(), proteins_with_ipr.len()); + + FunctionalAggregation { counts, data: ipr_protein_data } +} + +pub fn calculate_fa(proteins: &[ProteinInfo]) -> FunctionalAggregation { + // Keep track of the proteins that have a certain annotation + let (proteins_with_ec, ec_protein_data) = count_annotations(proteins, 'E'); + let (proteins_with_go, go_protein_data) = count_annotations(proteins, 'G'); + let (proteins_with_ipr, ipr_protein_data) = count_annotations(proteins, 'I'); + + // Keep track of the counts of the different annotations + let mut data: HashMap = HashMap::new(); + + data.extend(ec_protein_data); + data.extend(go_protein_data); + data.extend(ipr_protein_data); + let mut counts: HashMap = HashMap::new(); - counts.insert("all".to_string(), proteins_with_annotations.len()); + counts.insert("all".to_string(), proteins_with_ec.len() + proteins_with_go.len() + proteins_with_ipr.len()); counts.insert("EC".to_string(), proteins_with_ec.len()); counts.insert("GO".to_string(), proteins_with_go.len()); counts.insert("IPR".to_string(), proteins_with_ipr.len()); From ddc76b1ac30805ce1147fa2dc044ad2b25ed5d17 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Wed, 28 Aug 2024 15:50:14 +0200 Subject: [PATCH 3/9] use the updated index. Only request the ec numbers when needed --- api/src/helpers/fa_helper.rs | 93 ++++++++++++++++++++++-------------- index/Cargo.toml | 6 +-- 2 files changed, 59 insertions(+), 40 deletions(-) diff --git a/api/src/helpers/fa_helper.rs b/api/src/helpers/fa_helper.rs index edf4cb8..5f51d99 100644 --- a/api/src/helpers/fa_helper.rs +++ b/api/src/helpers/fa_helper.rs @@ -12,73 +12,92 @@ pub struct FunctionalAggregation { pub data: HashMap } -fn count_annotations( - proteins: &[ProteinInfo], - annotation_prefix: char -) -> (HashSet, HashMap) { - let mut proteins_with_annotation: HashSet = HashSet::new(); - let mut protein_data: HashMap = HashMap::new(); +pub fn calculate_ec(proteins: &[ProteinInfo]) -> FunctionalAggregation { + let mut proteins_with_ec: HashSet<&str> = HashSet::new(); + + let mut data: HashMap = HashMap::new(); for protein in proteins.iter() { - for annotation in protein.functional_annotations.split(';') { - match annotation.chars().next() { - Some(c) => { - if c == annotation_prefix { - proteins_with_annotation.insert(protein.uniprot_accession.clone()); - proteins_with_annotation.insert(protein.uniprot_accession.clone()); - protein_data.entry(annotation.to_string()).and_modify(|c| *c += 1).or_insert(1); - } - } - _ => {} - }; + for ec_number in protein.ec_numbers.split(';') { + proteins_with_ec.insert(&protein.uniprot_accession); // TODO: outside of loop? + data.entry(ec_number.to_string()).and_modify(|c| *c += 1).or_insert(1); } } - (proteins_with_annotation, protein_data) -} - -pub fn calculate_ec(proteins: &[ProteinInfo]) -> FunctionalAggregation { - let (proteins_with_ec, ec_protein_data) = count_annotations(proteins, 'E'); - let mut counts: HashMap = HashMap::new(); counts.insert("all".to_string(), proteins_with_ec.len()); - FunctionalAggregation { counts, data: ec_protein_data } + FunctionalAggregation { counts, data } } pub fn calculate_go(proteins: &[ProteinInfo]) -> FunctionalAggregation { - let (proteins_with_go, go_protein_data) = count_annotations(proteins, 'G'); + let mut proteins_with_go: HashSet<&str> = HashSet::new(); + + let mut data: HashMap = HashMap::new(); + + for protein in proteins.iter() { + for go_term in protein.go_terms.split(';') { + proteins_with_go.insert(&protein.uniprot_accession); // TODO: outside of loop? + data.entry(go_term.to_string()).and_modify(|c| *c += 1).or_insert(1); + } + } let mut counts: HashMap = HashMap::new(); counts.insert("all".to_string(), proteins_with_go.len()); - FunctionalAggregation { counts, data: go_protein_data } + FunctionalAggregation { counts, data } } pub fn calculate_ipr(proteins: &[ProteinInfo]) -> FunctionalAggregation { - let (proteins_with_ipr, ipr_protein_data) = count_annotations(proteins, 'I'); + let mut proteins_with_ipr: HashSet<&str> = HashSet::new(); + + let mut data: HashMap = HashMap::new(); + + for protein in proteins.iter() { + for interpro_entry in protein.interpro_entries.split(';') { + proteins_with_ipr.insert(&protein.uniprot_accession); + data.entry(interpro_entry.to_string()).and_modify(|c| *c += 1).or_insert(1); + } + } let mut counts: HashMap = HashMap::new(); counts.insert("all".to_string(), proteins_with_ipr.len()); - FunctionalAggregation { counts, data: ipr_protein_data } + FunctionalAggregation { counts, data } } pub fn calculate_fa(proteins: &[ProteinInfo]) -> FunctionalAggregation { - // Keep track of the proteins that have a certain annotation - let (proteins_with_ec, ec_protein_data) = count_annotations(proteins, 'E'); - let (proteins_with_go, go_protein_data) = count_annotations(proteins, 'G'); - let (proteins_with_ipr, ipr_protein_data) = count_annotations(proteins, 'I'); + // Keep track of the proteins that have any annotation + let mut proteins_with_annotations: HashSet<&str> = HashSet::new(); + + let mut proteins_with_ec: HashSet<&str> = HashSet::new(); + let mut proteins_with_go: HashSet<&str> = HashSet::new(); + let mut proteins_with_ipr: HashSet<&str> = HashSet::new(); - // Keep track of the counts of the different annotations let mut data: HashMap = HashMap::new(); - data.extend(ec_protein_data); - data.extend(go_protein_data); - data.extend(ipr_protein_data); + for protein in proteins.iter() { + for ec_number in protein.ec_numbers.split(';') { + proteins_with_ec.insert(&protein.uniprot_accession); + proteins_with_annotations.insert(&protein.uniprot_accession); + data.entry(ec_number.to_string()).and_modify(|c| *c += 1).or_insert(1); + } + + for go_term in protein.go_terms.split(';') { + proteins_with_go.insert(&protein.uniprot_accession); + proteins_with_annotations.insert(&protein.uniprot_accession); + data.entry(go_term.to_string()).and_modify(|c| *c += 1).or_insert(1); + } + + for interpro_entry in protein.interpro_entries.split(';') { + proteins_with_ipr.insert(&protein.uniprot_accession); + proteins_with_annotations.insert(&protein.uniprot_accession); + data.entry(interpro_entry.to_string()).and_modify(|c| *c += 1).or_insert(1); + } + } let mut counts: HashMap = HashMap::new(); - counts.insert("all".to_string(), proteins_with_ec.len() + proteins_with_go.len() + proteins_with_ipr.len()); + counts.insert("all".to_string(), proteins_with_annotations.len()); counts.insert("EC".to_string(), proteins_with_ec.len()); counts.insert("GO".to_string(), proteins_with_go.len()); counts.insert("IPR".to_string(), proteins_with_ipr.len()); diff --git a/index/Cargo.toml b/index/Cargo.toml index 6401e85..84e540b 100644 --- a/index/Cargo.toml +++ b/index/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -sa-compression = { git = "https://github.com/unipept/unipept-index.git", rev = "352bcb797dcd6c369275aec178417bc14e6d2480" } -sa-index = { git = "https://github.com/unipept/unipept-index.git", rev = "352bcb797dcd6c369275aec178417bc14e6d2480" } -sa-mappings = { git = "https://github.com/unipept/unipept-index.git", rev = "352bcb797dcd6c369275aec178417bc14e6d2480" } +sa-compression = { git = "https://github.com/unipept/unipept-index.git", rev = "0bdd1f2c70814a61106628ef5de7d745df576ce5" } +sa-index = { git = "https://github.com/unipept/unipept-index.git", rev = "0bdd1f2c70814a61106628ef5de7d745df576ce5" } +sa-mappings = { git = "https://github.com/unipept/unipept-index.git", rev = "0bdd1f2c70814a61106628ef5de7d745df576ce5" } thiserror = "1.0" From 5f0d45c9498b861d55bda3a3f3e76c66a9e6add9 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Wed, 28 Aug 2024 20:55:26 +0200 Subject: [PATCH 4/9] use references of the proteins, rather than cloning them --- Cargo.toml | 2 +- api/src/controllers/api/pept2lca.rs | 2 +- api/src/controllers/api/pept2prot.rs | 8 ++-- api/src/controllers/api/pept2taxa.rs | 2 +- api/src/controllers/api/peptinfo.rs | 2 +- api/src/controllers/mpa/pept2data.rs | 2 +- api/src/controllers/mpa/pept2filtered.rs | 2 +- api/src/controllers/private_api/proteins.rs | 8 ++-- api/src/helpers/fa_helper.rs | 48 ++++++++++----------- index/Cargo.toml | 6 +-- index/src/lib.rs | 1 + 11 files changed, 42 insertions(+), 41 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c7da882..4b8e664 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ resolver = "2" members = [ - "api", + "api", "database", "datastore", "index" diff --git a/api/src/controllers/api/pept2lca.rs b/api/src/controllers/api/pept2lca.rs index ae21a60..0181a03 100644 --- a/api/src/controllers/api/pept2lca.rs +++ b/api/src/controllers/api/pept2lca.rs @@ -60,7 +60,7 @@ async fn handler( .into_iter() .filter_map(|item| { let lca = calculate_lca( - item.proteins.iter().map(|protein| protein.taxon).collect(), + item.proteins.iter().map(|protein| protein.taxon_id).collect(), version, taxon_store, lineage_store diff --git a/api/src/controllers/api/pept2prot.rs b/api/src/controllers/api/pept2prot.rs index 78bb50f..aa16fc2 100644 --- a/api/src/controllers/api/pept2prot.rs +++ b/api/src/controllers/api/pept2prot.rs @@ -57,7 +57,7 @@ async fn handler( let accession_numbers: Vec = result .iter() - .flat_map(|item| item.proteins.iter().map(|protein| protein.uniprot_accession.clone())) + .flat_map(|item| item.proteins.iter().map(|protein| protein.uniprot_id.clone())) .collect(); let accessions_map = connection.interact(move |conn| get_accessions_map(conn, &accession_numbers)).await??; @@ -70,7 +70,7 @@ async fn handler( item.proteins .into_iter() .filter_map(|protein| { - let uniprot_entry = accessions_map.get(&protein.uniprot_accession)?; + let uniprot_entry = accessions_map.get(&protein.uniprot_id)?; if extra { let taxon_name = taxon_store.get_name(uniprot_entry.taxon_id)?; @@ -97,7 +97,7 @@ async fn handler( Some(ProtInformation::Extra { peptide: item.sequence.clone(), - uniprot_id: protein.uniprot_accession.clone(), + uniprot_id: protein.uniprot_id.clone(), protein_name: uniprot_entry.name.clone(), taxon_id: uniprot_entry.taxon_id, taxon_name: taxon_name.clone(), @@ -109,7 +109,7 @@ async fn handler( } else { Some(ProtInformation::Default { peptide: item.sequence.clone(), - uniprot_id: protein.uniprot_accession.clone(), + uniprot_id: protein.uniprot_id.clone(), protein_name: uniprot_entry.name.clone(), taxon_id: uniprot_entry.taxon_id, protein: uniprot_entry.protein.clone() diff --git a/api/src/controllers/api/pept2taxa.rs b/api/src/controllers/api/pept2taxa.rs index 72c0943..cd62690 100644 --- a/api/src/controllers/api/pept2taxa.rs +++ b/api/src/controllers/api/pept2taxa.rs @@ -58,7 +58,7 @@ async fn handler( Ok(result .into_iter() .flat_map(|item| { - item.proteins.iter().map(|protein| protein.taxon).collect::>().into_iter().filter_map( + item.proteins.iter().map(|protein| protein.taxon_id).collect::>().into_iter().filter_map( move |taxon| { let (name, rank, _) = taxon_store.get(taxon)?; let lineage = match (extra, names) { diff --git a/api/src/controllers/api/peptinfo.rs b/api/src/controllers/api/peptinfo.rs index 7b759f6..110dfc2 100644 --- a/api/src/controllers/api/peptinfo.rs +++ b/api/src/controllers/api/peptinfo.rs @@ -80,7 +80,7 @@ async fn handler( let iprs = interpro_entries_from_map(&fa.data, interpro_store, extra, domains); let lca = calculate_lca( - item.proteins.iter().map(|protein| protein.taxon).collect(), + item.proteins.iter().map(|protein| protein.taxon_id).collect(), version, taxon_store, lineage_store diff --git a/api/src/controllers/mpa/pept2data.rs b/api/src/controllers/mpa/pept2data.rs index 4d4205d..8dceed8 100644 --- a/api/src/controllers/mpa/pept2data.rs +++ b/api/src/controllers/mpa/pept2data.rs @@ -55,7 +55,7 @@ async fn handler( .into_iter() .map(|item| { let lca = calculate_lca( - item.proteins.iter().map(|protein| protein.taxon).collect(), + item.proteins.iter().map(|protein| protein.taxon_id).collect(), LineageVersion::V2, taxon_store, lineage_store diff --git a/api/src/controllers/mpa/pept2filtered.rs b/api/src/controllers/mpa/pept2filtered.rs index 9bf785d..787d42f 100644 --- a/api/src/controllers/mpa/pept2filtered.rs +++ b/api/src/controllers/mpa/pept2filtered.rs @@ -49,7 +49,7 @@ async fn handler( peptides: result .into_iter() .filter_map(|item| { - let item_taxa: Vec = item.proteins.iter().map(|protein| protein.taxon).collect(); + let item_taxa: Vec = item.proteins.iter().map(|protein| protein.taxon_id).collect(); if item_taxa.is_empty() { return None; diff --git a/api/src/controllers/private_api/proteins.rs b/api/src/controllers/private_api/proteins.rs index 750ce1d..6d87b4b 100644 --- a/api/src/controllers/private_api/proteins.rs +++ b/api/src/controllers/private_api/proteins.rs @@ -58,14 +58,14 @@ async fn handler( } let accession_numbers: Vec = - result[0].proteins.iter().map(|protein| protein.uniprot_accession.clone()).collect(); + result[0].proteins.iter().map(|protein| protein.uniprot_id.clone()).collect(); let accessions_map = connection.interact(move |conn| get_accessions_map(conn, &accession_numbers)).await??; let taxon_store = datastore.taxon_store(); let lineage_store = datastore.lineage_store(); - let taxa = result[0].proteins.iter().map(|protein| protein.taxon).collect(); + let taxa = result[0].proteins.iter().map(|protein| protein.taxon_id).collect(); let lca = calculate_lca(taxa, LineageVersion::V2, taxon_store, lineage_store); let common_lineage = get_lineage_array(lca as u32, LineageVersion::V2, lineage_store) @@ -80,7 +80,7 @@ async fn handler( .proteins .iter() .filter_map(|protein| { - let uniprot_entry = accessions_map.get(&protein.uniprot_accession)?; + let uniprot_entry = accessions_map.get(&protein.uniprot_id)?; let fa: Vec<&str> = uniprot_entry.fa.split(';').collect(); let ec_numbers = @@ -94,7 +94,7 @@ async fn handler( .collect::>(); Some(Protein { - uniprot_accession_id: protein.uniprot_accession.clone(), + uniprot_accession_id: protein.uniprot_id.clone(), name: uniprot_entry.name.clone(), organism: uniprot_entry.taxon_id, ec_numbers, diff --git a/api/src/helpers/fa_helper.rs b/api/src/helpers/fa_helper.rs index 5f51d99..dc186e5 100644 --- a/api/src/helpers/fa_helper.rs +++ b/api/src/helpers/fa_helper.rs @@ -1,7 +1,7 @@ use std::collections::{HashMap, HashSet}; -use index::ProteinInfo; use serde::Serialize; +use index::Protein; /// A struct that represents the functional annotations once aggregated #[derive(Debug, Serialize)] @@ -12,14 +12,14 @@ pub struct FunctionalAggregation { pub data: HashMap } -pub fn calculate_ec(proteins: &[ProteinInfo]) -> FunctionalAggregation { +pub fn calculate_ec(proteins: &[&Protein]) -> FunctionalAggregation { let mut proteins_with_ec: HashSet<&str> = HashSet::new(); let mut data: HashMap = HashMap::new(); - for protein in proteins.iter() { - for ec_number in protein.ec_numbers.split(';') { - proteins_with_ec.insert(&protein.uniprot_accession); // TODO: outside of loop? + for &protein in proteins.iter() { + for ec_number in protein.get_ec_numbers().split(';') { + proteins_with_ec.insert(&protein.uniprot_id); // TODO: outside of loop? data.entry(ec_number.to_string()).and_modify(|c| *c += 1).or_insert(1); } } @@ -30,14 +30,14 @@ pub fn calculate_ec(proteins: &[ProteinInfo]) -> FunctionalAggregation { FunctionalAggregation { counts, data } } -pub fn calculate_go(proteins: &[ProteinInfo]) -> FunctionalAggregation { +pub fn calculate_go(proteins: &[&Protein]) -> FunctionalAggregation { let mut proteins_with_go: HashSet<&str> = HashSet::new(); let mut data: HashMap = HashMap::new(); - for protein in proteins.iter() { - for go_term in protein.go_terms.split(';') { - proteins_with_go.insert(&protein.uniprot_accession); // TODO: outside of loop? + for &protein in proteins.iter() { + for go_term in protein.get_go_terms().split(';') { + proteins_with_go.insert(&protein.uniprot_id); // TODO: outside of loop? data.entry(go_term.to_string()).and_modify(|c| *c += 1).or_insert(1); } } @@ -48,14 +48,14 @@ pub fn calculate_go(proteins: &[ProteinInfo]) -> FunctionalAggregation { FunctionalAggregation { counts, data } } -pub fn calculate_ipr(proteins: &[ProteinInfo]) -> FunctionalAggregation { +pub fn calculate_ipr(proteins: &[&Protein]) -> FunctionalAggregation { let mut proteins_with_ipr: HashSet<&str> = HashSet::new(); let mut data: HashMap = HashMap::new(); - for protein in proteins.iter() { - for interpro_entry in protein.interpro_entries.split(';') { - proteins_with_ipr.insert(&protein.uniprot_accession); + for &protein in proteins.iter() { + for interpro_entry in protein.get_interpro_entries().split(';') { + proteins_with_ipr.insert(&protein.uniprot_id); data.entry(interpro_entry.to_string()).and_modify(|c| *c += 1).or_insert(1); } } @@ -66,7 +66,7 @@ pub fn calculate_ipr(proteins: &[ProteinInfo]) -> FunctionalAggregation { FunctionalAggregation { counts, data } } -pub fn calculate_fa(proteins: &[ProteinInfo]) -> FunctionalAggregation { +pub fn calculate_fa(proteins: &[&Protein]) -> FunctionalAggregation { // Keep track of the proteins that have any annotation let mut proteins_with_annotations: HashSet<&str> = HashSet::new(); @@ -76,22 +76,22 @@ pub fn calculate_fa(proteins: &[ProteinInfo]) -> FunctionalAggregation { let mut data: HashMap = HashMap::new(); - for protein in proteins.iter() { - for ec_number in protein.ec_numbers.split(';') { - proteins_with_ec.insert(&protein.uniprot_accession); - proteins_with_annotations.insert(&protein.uniprot_accession); + for &protein in proteins.iter() { + for ec_number in protein.get_ec_numbers().split(';') { + proteins_with_ec.insert(&protein.uniprot_id); + proteins_with_annotations.insert(&protein.uniprot_id); data.entry(ec_number.to_string()).and_modify(|c| *c += 1).or_insert(1); } - for go_term in protein.go_terms.split(';') { - proteins_with_go.insert(&protein.uniprot_accession); - proteins_with_annotations.insert(&protein.uniprot_accession); + for go_term in protein.get_go_terms().split(';') { + proteins_with_go.insert(&protein.uniprot_id); + proteins_with_annotations.insert(&protein.uniprot_id); data.entry(go_term.to_string()).and_modify(|c| *c += 1).or_insert(1); } - for interpro_entry in protein.interpro_entries.split(';') { - proteins_with_ipr.insert(&protein.uniprot_accession); - proteins_with_annotations.insert(&protein.uniprot_accession); + for interpro_entry in protein.get_interpro_entries().split(';') { + proteins_with_ipr.insert(&protein.uniprot_id); + proteins_with_annotations.insert(&protein.uniprot_id); data.entry(interpro_entry.to_string()).and_modify(|c| *c += 1).or_insert(1); } } diff --git a/index/Cargo.toml b/index/Cargo.toml index 84e540b..5e85b19 100644 --- a/index/Cargo.toml +++ b/index/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -sa-compression = { git = "https://github.com/unipept/unipept-index.git", rev = "0bdd1f2c70814a61106628ef5de7d745df576ce5" } -sa-index = { git = "https://github.com/unipept/unipept-index.git", rev = "0bdd1f2c70814a61106628ef5de7d745df576ce5" } -sa-mappings = { git = "https://github.com/unipept/unipept-index.git", rev = "0bdd1f2c70814a61106628ef5de7d745df576ce5" } +sa-compression = { git = "https://github.com/unipept/unipept-index.git", rev = "cad3fb5967d28e8d2035a6dbd6aaf704bfaf7ff0" } +sa-index = { git = "https://github.com/unipept/unipept-index.git", rev = "cad3fb5967d28e8d2035a6dbd6aaf704bfaf7ff0" } +sa-mappings = { git = "https://github.com/unipept/unipept-index.git", rev = "cad3fb5967d28e8d2035a6dbd6aaf704bfaf7ff0" } thiserror = "1.0" diff --git a/index/src/lib.rs b/index/src/lib.rs index 60f3dcf..49551a4 100644 --- a/index/src/lib.rs +++ b/index/src/lib.rs @@ -7,6 +7,7 @@ pub use errors::IndexError; use errors::LoadIndexError; use sa_compression::load_compressed_suffix_array; pub use sa_index::peptide_search::ProteinInfo; +pub use sa_mappings::proteins::Protein; use sa_index::{ binary::load_suffix_array, peptide_search::{search_all_peptides, SearchResult}, From 79d66b3433e453fd8dd861856e3235477ded3647 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Thu, 29 Aug 2024 15:30:40 +0200 Subject: [PATCH 5/9] No more allocations to store the protein references --- api/src/controllers/api/pept2ec.rs | 2 +- api/src/controllers/api/pept2funct.rs | 2 +- api/src/controllers/api/pept2go.rs | 2 +- api/src/controllers/api/pept2interpro.rs | 2 +- api/src/controllers/api/pept2lca.rs | 2 +- api/src/controllers/api/pept2prot.rs | 4 ++-- api/src/controllers/api/pept2taxa.rs | 2 +- api/src/controllers/api/peptinfo.rs | 4 ++-- api/src/controllers/mpa/pept2data.rs | 5 +++-- api/src/controllers/mpa/pept2filtered.rs | 4 ++-- api/src/controllers/private_api/proteins.rs | 7 +++---- api/src/helpers/fa_helper.rs | 18 +++++++++--------- index/Cargo.toml | 6 +++--- index/src/lib.rs | 4 ++-- 14 files changed, 32 insertions(+), 32 deletions(-) diff --git a/api/src/controllers/api/pept2ec.rs b/api/src/controllers/api/pept2ec.rs index 7e3e176..b2182b3 100644 --- a/api/src/controllers/api/pept2ec.rs +++ b/api/src/controllers/api/pept2ec.rs @@ -53,7 +53,7 @@ async fn handler( let mut final_results = Vec::new(); for (unique_peptide, item) in unique_peptides.iter().zip(result.into_iter()) { if let Some(count) = peptide_counts.get(unique_peptide) { - let fa = calculate_ec(&item.proteins); + let fa = calculate_ec(item.proteins(&index.searcher)); let total_protein_count = *fa.counts.get("all").unwrap_or(&0); for _ in 0..*count { diff --git a/api/src/controllers/api/pept2funct.rs b/api/src/controllers/api/pept2funct.rs index ac3f68a..d47eaa3 100644 --- a/api/src/controllers/api/pept2funct.rs +++ b/api/src/controllers/api/pept2funct.rs @@ -51,7 +51,7 @@ async fn handler( Ok(result .into_iter() .map(|item| { - let fa = calculate_fa(&item.proteins); + let fa = calculate_fa(item.proteins(&index.searcher)); let total_protein_count = *fa.counts.get("all").unwrap_or(&0); let ecs = ec_numbers_from_map(&fa.data, ec_store, extra); diff --git a/api/src/controllers/api/pept2go.rs b/api/src/controllers/api/pept2go.rs index f20eb3c..f4c2ed1 100644 --- a/api/src/controllers/api/pept2go.rs +++ b/api/src/controllers/api/pept2go.rs @@ -46,7 +46,7 @@ async fn handler( Ok(result .into_iter() .map(|item| { - let fa = calculate_go(&item.proteins); + let fa = calculate_go(item.proteins(&index.searcher)); let total_protein_count = *fa.counts.get("all").unwrap_or(&0); let gos = go_terms_from_map(&fa.data, go_store, extra, domains); diff --git a/api/src/controllers/api/pept2interpro.rs b/api/src/controllers/api/pept2interpro.rs index fe9de47..9af5932 100644 --- a/api/src/controllers/api/pept2interpro.rs +++ b/api/src/controllers/api/pept2interpro.rs @@ -46,7 +46,7 @@ async fn handler( Ok(result .into_iter() .map(|item| { - let fa = calculate_ipr(&item.proteins); + let fa = calculate_ipr(item.proteins(&index.searcher)); let total_protein_count = *fa.counts.get("all").unwrap_or(&0); let iprs = interpro_entries_from_map(&fa.data, interpro_store, extra, domains); diff --git a/api/src/controllers/api/pept2lca.rs b/api/src/controllers/api/pept2lca.rs index 0181a03..6b92754 100644 --- a/api/src/controllers/api/pept2lca.rs +++ b/api/src/controllers/api/pept2lca.rs @@ -60,7 +60,7 @@ async fn handler( .into_iter() .filter_map(|item| { let lca = calculate_lca( - item.proteins.iter().map(|protein| protein.taxon_id).collect(), + item.proteins(&index.searcher).map(|protein| protein.taxon_id).collect(), version, taxon_store, lineage_store diff --git a/api/src/controllers/api/pept2prot.rs b/api/src/controllers/api/pept2prot.rs index aa16fc2..d0b64bc 100644 --- a/api/src/controllers/api/pept2prot.rs +++ b/api/src/controllers/api/pept2prot.rs @@ -57,7 +57,7 @@ async fn handler( let accession_numbers: Vec = result .iter() - .flat_map(|item| item.proteins.iter().map(|protein| protein.uniprot_id.clone())) + .flat_map(|item| item.proteins(&index.searcher).map(|protein| protein.uniprot_id.clone())) .collect(); let accessions_map = connection.interact(move |conn| get_accessions_map(conn, &accession_numbers)).await??; @@ -67,7 +67,7 @@ async fn handler( Ok(result .into_iter() .flat_map(|item| { - item.proteins + item.proteins(&index.searcher) .into_iter() .filter_map(|protein| { let uniprot_entry = accessions_map.get(&protein.uniprot_id)?; diff --git a/api/src/controllers/api/pept2taxa.rs b/api/src/controllers/api/pept2taxa.rs index cd62690..7347622 100644 --- a/api/src/controllers/api/pept2taxa.rs +++ b/api/src/controllers/api/pept2taxa.rs @@ -58,7 +58,7 @@ async fn handler( Ok(result .into_iter() .flat_map(|item| { - item.proteins.iter().map(|protein| protein.taxon_id).collect::>().into_iter().filter_map( + item.proteins(&index.searcher).map(|protein| protein.taxon_id).collect::>().into_iter().filter_map( move |taxon| { let (name, rank, _) = taxon_store.get(taxon)?; let lineage = match (extra, names) { diff --git a/api/src/controllers/api/peptinfo.rs b/api/src/controllers/api/peptinfo.rs index 110dfc2..5ff71ca 100644 --- a/api/src/controllers/api/peptinfo.rs +++ b/api/src/controllers/api/peptinfo.rs @@ -72,7 +72,7 @@ async fn handler( Ok(result .into_iter() .filter_map(|item| { - let fa = calculate_fa(&item.proteins); + let fa = calculate_fa(item.proteins(&index.searcher)); let total_protein_count = *fa.counts.get("all").unwrap_or(&0); let ecs = ec_numbers_from_map(&fa.data, ec_store, extra); @@ -80,7 +80,7 @@ async fn handler( let iprs = interpro_entries_from_map(&fa.data, interpro_store, extra, domains); let lca = calculate_lca( - item.proteins.iter().map(|protein| protein.taxon_id).collect(), + item.proteins(&index.searcher).map(|protein| protein.taxon_id).collect(), version, taxon_store, lineage_store diff --git a/api/src/controllers/mpa/pept2data.rs b/api/src/controllers/mpa/pept2data.rs index 8dceed8..1b45636 100644 --- a/api/src/controllers/mpa/pept2data.rs +++ b/api/src/controllers/mpa/pept2data.rs @@ -55,18 +55,19 @@ async fn handler( .into_iter() .map(|item| { let lca = calculate_lca( - item.proteins.iter().map(|protein| protein.taxon_id).collect(), + item.proteins(&index.searcher).map(|protein| protein.taxon_id).collect(), LineageVersion::V2, taxon_store, lineage_store ); let lineage = get_lineage_array(lca as u32, LineageVersion::V2, lineage_store); + let fa = calculate_fa(item.proteins(&index.searcher)); DataItem { sequence: item.sequence, lca: Some(lca as u32), lineage, - fa: calculate_fa(&item.proteins) + fa } }) .collect() diff --git a/api/src/controllers/mpa/pept2filtered.rs b/api/src/controllers/mpa/pept2filtered.rs index 787d42f..7a10eac 100644 --- a/api/src/controllers/mpa/pept2filtered.rs +++ b/api/src/controllers/mpa/pept2filtered.rs @@ -49,14 +49,14 @@ async fn handler( peptides: result .into_iter() .filter_map(|item| { - let item_taxa: Vec = item.proteins.iter().map(|protein| protein.taxon_id).collect(); + let item_taxa: Vec = item.proteins(&index.searcher).map(|protein| protein.taxon_id).collect(); if item_taxa.is_empty() { return None; } let fa = if include_fa { - Some(calculate_fa(&item.proteins)) + Some(calculate_fa(item.proteins(&index.searcher))) } else { None }; diff --git a/api/src/controllers/private_api/proteins.rs b/api/src/controllers/private_api/proteins.rs index 6d87b4b..234e772 100644 --- a/api/src/controllers/private_api/proteins.rs +++ b/api/src/controllers/private_api/proteins.rs @@ -58,14 +58,14 @@ async fn handler( } let accession_numbers: Vec = - result[0].proteins.iter().map(|protein| protein.uniprot_id.clone()).collect(); + result[0].proteins(&index.searcher).map(|protein| protein.uniprot_id.clone()).collect(); let accessions_map = connection.interact(move |conn| get_accessions_map(conn, &accession_numbers)).await??; let taxon_store = datastore.taxon_store(); let lineage_store = datastore.lineage_store(); - let taxa = result[0].proteins.iter().map(|protein| protein.taxon_id).collect(); + let taxa = result[0].proteins(&index.searcher).map(|protein| protein.taxon_id).collect(); let lca = calculate_lca(taxa, LineageVersion::V2, taxon_store, lineage_store); let common_lineage = get_lineage_array(lca as u32, LineageVersion::V2, lineage_store) @@ -77,8 +77,7 @@ async fn handler( lca, common_lineage, proteins: result[0] - .proteins - .iter() + .proteins(&index.searcher) .filter_map(|protein| { let uniprot_entry = accessions_map.get(&protein.uniprot_id)?; diff --git a/api/src/helpers/fa_helper.rs b/api/src/helpers/fa_helper.rs index dc186e5..8a511e5 100644 --- a/api/src/helpers/fa_helper.rs +++ b/api/src/helpers/fa_helper.rs @@ -1,7 +1,7 @@ use std::collections::{HashMap, HashSet}; use serde::Serialize; -use index::Protein; +use index::{Protein, ProteinsIterator}; /// A struct that represents the functional annotations once aggregated #[derive(Debug, Serialize)] @@ -12,12 +12,12 @@ pub struct FunctionalAggregation { pub data: HashMap } -pub fn calculate_ec(proteins: &[&Protein]) -> FunctionalAggregation { +pub fn calculate_ec(proteins: ProteinsIterator) -> FunctionalAggregation { let mut proteins_with_ec: HashSet<&str> = HashSet::new(); let mut data: HashMap = HashMap::new(); - for &protein in proteins.iter() { + for protein in proteins { for ec_number in protein.get_ec_numbers().split(';') { proteins_with_ec.insert(&protein.uniprot_id); // TODO: outside of loop? data.entry(ec_number.to_string()).and_modify(|c| *c += 1).or_insert(1); @@ -30,12 +30,12 @@ pub fn calculate_ec(proteins: &[&Protein]) -> FunctionalAggregation { FunctionalAggregation { counts, data } } -pub fn calculate_go(proteins: &[&Protein]) -> FunctionalAggregation { +pub fn calculate_go(proteins: ProteinsIterator) -> FunctionalAggregation { let mut proteins_with_go: HashSet<&str> = HashSet::new(); let mut data: HashMap = HashMap::new(); - for &protein in proteins.iter() { + for protein in proteins { for go_term in protein.get_go_terms().split(';') { proteins_with_go.insert(&protein.uniprot_id); // TODO: outside of loop? data.entry(go_term.to_string()).and_modify(|c| *c += 1).or_insert(1); @@ -48,12 +48,12 @@ pub fn calculate_go(proteins: &[&Protein]) -> FunctionalAggregation { FunctionalAggregation { counts, data } } -pub fn calculate_ipr(proteins: &[&Protein]) -> FunctionalAggregation { +pub fn calculate_ipr(proteins: ProteinsIterator) -> FunctionalAggregation { let mut proteins_with_ipr: HashSet<&str> = HashSet::new(); let mut data: HashMap = HashMap::new(); - for &protein in proteins.iter() { + for protein in proteins { for interpro_entry in protein.get_interpro_entries().split(';') { proteins_with_ipr.insert(&protein.uniprot_id); data.entry(interpro_entry.to_string()).and_modify(|c| *c += 1).or_insert(1); @@ -66,7 +66,7 @@ pub fn calculate_ipr(proteins: &[&Protein]) -> FunctionalAggregation { FunctionalAggregation { counts, data } } -pub fn calculate_fa(proteins: &[&Protein]) -> FunctionalAggregation { +pub fn calculate_fa(proteins: ProteinsIterator) -> FunctionalAggregation { // Keep track of the proteins that have any annotation let mut proteins_with_annotations: HashSet<&str> = HashSet::new(); @@ -76,7 +76,7 @@ pub fn calculate_fa(proteins: &[&Protein]) -> FunctionalAggregation { let mut data: HashMap = HashMap::new(); - for &protein in proteins.iter() { + for protein in proteins { for ec_number in protein.get_ec_numbers().split(';') { proteins_with_ec.insert(&protein.uniprot_id); proteins_with_annotations.insert(&protein.uniprot_id); diff --git a/index/Cargo.toml b/index/Cargo.toml index 5e85b19..240cf62 100644 --- a/index/Cargo.toml +++ b/index/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -sa-compression = { git = "https://github.com/unipept/unipept-index.git", rev = "cad3fb5967d28e8d2035a6dbd6aaf704bfaf7ff0" } -sa-index = { git = "https://github.com/unipept/unipept-index.git", rev = "cad3fb5967d28e8d2035a6dbd6aaf704bfaf7ff0" } -sa-mappings = { git = "https://github.com/unipept/unipept-index.git", rev = "cad3fb5967d28e8d2035a6dbd6aaf704bfaf7ff0" } +sa-compression = { git = "https://github.com/unipept/unipept-index.git", rev = "ad9b885636ef646e451f8eb447013880c8c47749" } +sa-index = { git = "https://github.com/unipept/unipept-index.git", rev = "ad9b885636ef646e451f8eb447013880c8c47749" } +sa-mappings = { git = "https://github.com/unipept/unipept-index.git", rev = "ad9b885636ef646e451f8eb447013880c8c47749" } thiserror = "1.0" diff --git a/index/src/lib.rs b/index/src/lib.rs index 49551a4..4854579 100644 --- a/index/src/lib.rs +++ b/index/src/lib.rs @@ -6,7 +6,7 @@ use std::str::from_utf8; pub use errors::IndexError; use errors::LoadIndexError; use sa_compression::load_compressed_suffix_array; -pub use sa_index::peptide_search::ProteinInfo; +pub use sa_index::peptide_search::ProteinsIterator; pub use sa_mappings::proteins::Protein; use sa_index::{ binary::load_suffix_array, @@ -19,7 +19,7 @@ use sa_mappings::proteins::Proteins; mod errors; pub struct Index { - searcher: SparseSearcher + pub searcher: SparseSearcher } impl Index { From 96fb5b368cde6fa55cf95a11a2183b6a47c74f53 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Fri, 30 Aug 2024 15:03:01 +0200 Subject: [PATCH 6/9] use bitvector-based searcher --- index/Cargo.toml | 6 +++--- index/src/lib.rs | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/index/Cargo.toml b/index/Cargo.toml index 240cf62..f3c3d48 100644 --- a/index/Cargo.toml +++ b/index/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -sa-compression = { git = "https://github.com/unipept/unipept-index.git", rev = "ad9b885636ef646e451f8eb447013880c8c47749" } -sa-index = { git = "https://github.com/unipept/unipept-index.git", rev = "ad9b885636ef646e451f8eb447013880c8c47749" } -sa-mappings = { git = "https://github.com/unipept/unipept-index.git", rev = "ad9b885636ef646e451f8eb447013880c8c47749" } +sa-compression = { git = "https://github.com/unipept/unipept-index.git", rev = "96e746e348d4ec38e6d3bfb65d04b5a3fa696e6c" } +sa-index = { git = "https://github.com/unipept/unipept-index.git", rev = "96e746e348d4ec38e6d3bfb65d04b5a3fa696e6c" } +sa-mappings = { git = "https://github.com/unipept/unipept-index.git", rev = "96e746e348d4ec38e6d3bfb65d04b5a3fa696e6c" } thiserror = "1.0" diff --git a/index/src/lib.rs b/index/src/lib.rs index 4854579..bf35ef6 100644 --- a/index/src/lib.rs +++ b/index/src/lib.rs @@ -14,12 +14,13 @@ use sa_index::{ sa_searcher::SparseSearcher, SuffixArray }; +use sa_index::sa_searcher::RankSearcher; use sa_mappings::proteins::Proteins; mod errors; pub struct Index { - pub searcher: SparseSearcher + pub searcher: RankSearcher } impl Index { @@ -30,7 +31,7 @@ impl Index { let proteins = Proteins::try_from_database_file(proteins_file) .map_err(|err| LoadIndexError::LoadProteinsErrors(err.to_string()))?; - let searcher = SparseSearcher::new(suffix_array, proteins, 5); + let searcher = RankSearcher::new(suffix_array, proteins, 5); Ok(Self { searcher }) } From a0a09b9181d518905b84fc02932a3a6b27b26a94 Mon Sep 17 00:00:00 2001 From: tibvdm Date: Fri, 30 Aug 2024 17:04:06 +0200 Subject: [PATCH 7/9] remove await from pept2ec request for testing purposes --- api/src/controllers/api/pept2ec.rs | 78 ++++++++++++++++++++++++++++++ api/src/routes.rs | 4 +- 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/api/src/controllers/api/pept2ec.rs b/api/src/controllers/api/pept2ec.rs index b2182b3..3d232d7 100644 --- a/api/src/controllers/api/pept2ec.rs +++ b/api/src/controllers/api/pept2ec.rs @@ -13,6 +13,7 @@ use crate::{ }, AppState }; +use crate::controllers::request::{GetContent, PostContent}; use crate::helpers::fa_helper::calculate_ec; use crate::helpers::sanitize_peptides; @@ -79,3 +80,80 @@ generate_handlers!( Ok(Json(handler(state, params).await?)) } ); + +pub async fn test_get_json_handler( + State(AppState { index, datastore, .. }): State, + GetContent(Parameters { input, equate_il, extra }): GetContent +) -> Result>, ()> { + let input = sanitize_peptides(input); + + let mut peptide_counts: HashMap = HashMap::new(); + for peptide in input.into_iter() { + *peptide_counts.entry(peptide).or_insert(0) += 1; + } + + let unique_peptides: Vec = peptide_counts.keys().cloned().collect(); + let result = index.analyse(&unique_peptides, equate_il, None); + + let ec_store = datastore.ec_store(); + + // Step 6: Duplicate the results according to the original input + let mut final_results = Vec::new(); + for (unique_peptide, item) in unique_peptides.iter().zip(result.into_iter()) { + if let Some(count) = peptide_counts.get(unique_peptide) { + let fa = calculate_ec(item.proteins(&index.searcher)); + let total_protein_count = *fa.counts.get("all").unwrap_or(&0); + + for _ in 0..*count { + let ecs = ec_numbers_from_map(&fa.data, ec_store, extra); + + final_results.push(EcInformation { + peptide: item.sequence.clone(), + total_protein_count, + ec: ecs, + }); + } + } + } + + Ok(Json(final_results)) +} + + +pub async fn test_post_json_handler( + State(AppState { index, datastore, .. }): State, + PostContent(Parameters { input, equate_il, extra }): PostContent +) -> Result>, ()> { + let input = sanitize_peptides(input); + + let mut peptide_counts: HashMap = HashMap::new(); + for peptide in input.into_iter() { + *peptide_counts.entry(peptide).or_insert(0) += 1; + } + + let unique_peptides: Vec = peptide_counts.keys().cloned().collect(); + let result = index.analyse(&unique_peptides, equate_il, None); + + let ec_store = datastore.ec_store(); + + // Step 6: Duplicate the results according to the original input + let mut final_results = Vec::new(); + for (unique_peptide, item) in unique_peptides.iter().zip(result.into_iter()) { + if let Some(count) = peptide_counts.get(unique_peptide) { + let fa = calculate_ec(item.proteins(&index.searcher)); + let total_protein_count = *fa.counts.get("all").unwrap_or(&0); + + for _ in 0..*count { + let ecs = ec_numbers_from_map(&fa.data, ec_store, extra); + + final_results.push(EcInformation { + peptide: item.sequence.clone(), + total_protein_count, + ec: ecs, + }); + } + } + } + + Ok(Json(final_results)) +} diff --git a/api/src/routes.rs b/api/src/routes.rs index aaee9f8..714c15f 100644 --- a/api/src/routes.rs +++ b/api/src/routes.rs @@ -50,7 +50,7 @@ fn create_api_routes() -> Router { fn create_api_v1_routes() -> Router { define_routes!( "/pept2ec", - get(pept2ec::get_json_handler).post(pept2ec::post_json_handler), + get(pept2ec::test_get_json_handler).post(pept2ec::test_post_json_handler), "/pept2funct", get(pept2funct::get_json_handler).post(pept2funct::post_json_handler), "/pept2go", @@ -80,7 +80,7 @@ fn create_api_v1_routes() -> Router { fn create_api_v2_routes() -> Router { define_routes!( "/pept2ec", - get(pept2ec::get_json_handler).post(pept2ec::post_json_handler), + get(pept2ec::test_get_json_handler).post(pept2ec::test_get_json_handler), "/pept2funct", get(pept2funct::get_json_handler).post(pept2funct::post_json_handler), "/pept2go", From a6291c7b118d924b95a611592688a16f9b0222af Mon Sep 17 00:00:00 2001 From: tibvdm Date: Fri, 30 Aug 2024 18:28:56 +0200 Subject: [PATCH 8/9] wrong function for post-v2 requests --- api/src/routes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/src/routes.rs b/api/src/routes.rs index 714c15f..eac5cae 100644 --- a/api/src/routes.rs +++ b/api/src/routes.rs @@ -80,7 +80,7 @@ fn create_api_v1_routes() -> Router { fn create_api_v2_routes() -> Router { define_routes!( "/pept2ec", - get(pept2ec::test_get_json_handler).post(pept2ec::test_get_json_handler), + get(pept2ec::test_get_json_handler).post(pept2ec::test_post_json_handler), "/pept2funct", get(pept2funct::get_json_handler).post(pept2funct::post_json_handler), "/pept2go", From c1c9ea6a045475a8234ee63aa581f8f14ac9f195 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Tue, 3 Sep 2024 10:44:21 +0200 Subject: [PATCH 9/9] Updated the splitted API endpoints --- api/src/controllers/api/pept2ec.rs | 1 - api/src/controllers/api/pept2go.rs | 1 - api/src/controllers/api/pept2interpro.rs | 1 - 3 files changed, 3 deletions(-) diff --git a/api/src/controllers/api/pept2ec.rs b/api/src/controllers/api/pept2ec.rs index 3d232d7..34d492e 100644 --- a/api/src/controllers/api/pept2ec.rs +++ b/api/src/controllers/api/pept2ec.rs @@ -9,7 +9,6 @@ use crate::{ }, helpers::{ ec_helper::{ec_numbers_from_map, EcNumber}, - fa_helper::calculate_fa }, AppState }; diff --git a/api/src/controllers/api/pept2go.rs b/api/src/controllers/api/pept2go.rs index f4c2ed1..3d64d6e 100644 --- a/api/src/controllers/api/pept2go.rs +++ b/api/src/controllers/api/pept2go.rs @@ -7,7 +7,6 @@ use crate::{ generate_handlers }, helpers::{ - fa_helper::calculate_fa, go_helper::{go_terms_from_map, GoTerms} }, AppState diff --git a/api/src/controllers/api/pept2interpro.rs b/api/src/controllers/api/pept2interpro.rs index 9af5932..88ba089 100644 --- a/api/src/controllers/api/pept2interpro.rs +++ b/api/src/controllers/api/pept2interpro.rs @@ -7,7 +7,6 @@ use crate::{ generate_handlers }, helpers::{ - fa_helper::calculate_fa, interpro_helper::{interpro_entries_from_map, InterproEntries} }, AppState