From d67057cf04fb010d6b23a398cfdb762f89661a09 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Wed, 28 Aug 2024 10:13:41 +0200 Subject: [PATCH 1/2] Query database in batches of 1000 proteins --- Cargo.lock | 11 +++++++++++ README.md | 2 +- api/Cargo.toml | 1 + api/src/controllers/api/pept2prot.rs | 4 +++- api/src/controllers/api/protinfo.rs | 2 ++ api/src/controllers/private_api/proteins.rs | 3 ++- database/Cargo.toml | 1 + database/src/lib.rs | 22 +++++++++++++++++---- 8 files changed, 39 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5eea72d..8e16e36 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -386,6 +386,7 @@ version = "0.1.0" dependencies = [ "deadpool-diesel", "diesel", + "itertools", "thiserror", ] @@ -703,6 +704,15 @@ version = "1.70.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.11" @@ -1416,6 +1426,7 @@ dependencies = [ "http", "http-body-util", "index", + "itertools", "paste", "serde", "serde_json", diff --git a/README.md b/README.md index 1b1ec41..13320a0 100644 --- a/README.md +++ b/README.md @@ -52,4 +52,4 @@ Follow these steps in order to easily work on the Unipept API in the devcontaine * You first have to build the binaries by running `cargo build --release`. * Then, you should start the mariadb server: `sudo service mariadb start` -* Finally, the Unipept API can be started with this command: `./target/release/unipept-api -i "/unipept-index-data" -d "mysql://root:unipept@127.0.0.1:3306/unipept" -p 80`. +* Finally, the Unipept API can be started with this command: `./target/release/unipept-api -i "/unipept-index-data" -d "mysql://root:root_pass@localhost:3306/unipept" -p 80`. diff --git a/api/Cargo.toml b/api/Cargo.toml index e840d74..bef0020 100644 --- a/api/Cargo.toml +++ b/api/Cargo.toml @@ -25,3 +25,4 @@ urlencoding = "2.1.3" http = "1.1.0" tower-layer = "0.3.2" tower-service = "0.3.2" +itertools = "0.13.0" diff --git a/api/src/controllers/api/pept2prot.rs b/api/src/controllers/api/pept2prot.rs index 78bb50f..7c8b945 100644 --- a/api/src/controllers/api/pept2prot.rs +++ b/api/src/controllers/api/pept2prot.rs @@ -1,4 +1,6 @@ +use std::collections::HashSet; use axum::{extract::State, Json}; +use itertools::Itertools; use database::get_accessions_map; use serde::{Deserialize, Serialize}; @@ -55,7 +57,7 @@ async fn handler( let result = index.analyse(&input, equate_il, None); - let accession_numbers: Vec = result + let accession_numbers: HashSet = result .iter() .flat_map(|item| item.proteins.iter().map(|protein| protein.uniprot_accession.clone())) .collect(); diff --git a/api/src/controllers/api/protinfo.rs b/api/src/controllers/api/protinfo.rs index c00b2d3..4300d34 100644 --- a/api/src/controllers/api/protinfo.rs +++ b/api/src/controllers/api/protinfo.rs @@ -1,3 +1,4 @@ +use std::collections::HashSet; use axum::{extract::State, Json}; use database::get_accessions; use serde::{Deserialize, Serialize}; @@ -58,6 +59,7 @@ async fn handler( version: LineageVersion ) -> Result, ApiError> { let input = sanitize_proteins(input); + let input = HashSet::from_iter(input.into_iter()); let connection = database.get_conn().await?; diff --git a/api/src/controllers/private_api/proteins.rs b/api/src/controllers/private_api/proteins.rs index 750ce1d..a2dddbc 100644 --- a/api/src/controllers/private_api/proteins.rs +++ b/api/src/controllers/private_api/proteins.rs @@ -1,3 +1,4 @@ +use std::collections::HashSet; use axum::{extract::State, Json}; use database::get_accessions_map; use serde::{Deserialize, Serialize}; @@ -57,7 +58,7 @@ async fn handler( return Ok(ProteinInformation::default()); } - let accession_numbers: Vec = + let accession_numbers: HashSet = result[0].proteins.iter().map(|protein| protein.uniprot_accession.clone()).collect(); let accessions_map = connection.interact(move |conn| get_accessions_map(conn, &accession_numbers)).await??; diff --git a/database/Cargo.toml b/database/Cargo.toml index 29ab330..2b03cf6 100644 --- a/database/Cargo.toml +++ b/database/Cargo.toml @@ -6,4 +6,5 @@ edition = "2021" [dependencies] deadpool-diesel = { version = "0.4.1", features = ["mysql"] } diesel = { version = "2", features = ["mysql"] } +itertools = "0.13.0" thiserror = "1.0" diff --git a/database/src/lib.rs b/database/src/lib.rs index 3db0349..fb76482 100644 --- a/database/src/lib.rs +++ b/database/src/lib.rs @@ -1,10 +1,11 @@ use std::{collections::HashMap, ops::Deref}; - +use std::collections::HashSet; use deadpool_diesel::mysql::{Manager, Object, Pool}; pub use deadpool_diesel::InteractError; use diesel::{prelude::*, MysqlConnection, QueryDsl}; pub use errors::DatabaseError; use models::UniprotEntry; +use itertools::Itertools; mod errors; mod models; @@ -36,16 +37,29 @@ impl Deref for Database { pub fn get_accessions( conn: &mut MysqlConnection, - accessions: &Vec + accessions: &HashSet ) -> Result, DatabaseError> { use schema::uniprot_entries::dsl::*; - Ok(uniprot_entries.filter(uniprot_accession_number.eq_any(accessions)).load(conn)?) + let mut result: Vec = Vec::new(); + + accessions + .into_iter() + .chunks(1000) + .into_iter() + .for_each(|chunk| { + let data = uniprot_entries.filter(uniprot_accession_number.eq_any(accessions)).load(conn); + if data.is_ok() { + result.extend(data.unwrap()); + } + }); + + Ok(result) } pub fn get_accessions_map( conn: &mut MysqlConnection, - accessions: &Vec + accessions: &HashSet ) -> Result, DatabaseError> { Ok(get_accessions(conn, accessions)? .into_iter() From 013198606760ee21f8000d803eaf833ca5d6f6b8 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Wed, 28 Aug 2024 10:15:50 +0200 Subject: [PATCH 2/2] Forgot to use variable chunk --- database/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/database/src/lib.rs b/database/src/lib.rs index fb76482..af65ff1 100644 --- a/database/src/lib.rs +++ b/database/src/lib.rs @@ -48,7 +48,7 @@ pub fn get_accessions( .chunks(1000) .into_iter() .for_each(|chunk| { - let data = uniprot_entries.filter(uniprot_accession_number.eq_any(accessions)).load(conn); + let data = uniprot_entries.filter(uniprot_accession_number.eq_any(chunk)).load(conn); if data.is_ok() { result.extend(data.unwrap()); }