From 358481c71dea1b6287a99a83ea523453d55377b4 Mon Sep 17 00:00:00 2001 From: Stijn De Clercq Date: Fri, 27 Oct 2023 12:13:41 +0200 Subject: [PATCH 01/18] First compiling version of rust LCAs --- .../new-parsers/src/calculate_lcas/mod.rs | 1 + .../src/calculate_lcas/taxonomy.rs | 143 ++++++++++++++++++ .../unipept-database-rs/src/bin/lcas.rs | 14 ++ .../unipept-database-rs/src/lib.rs | 1 + 4 files changed, 159 insertions(+) create mode 100644 scripts/helper_scripts/new-parsers/src/calculate_lcas/mod.rs create mode 100644 scripts/helper_scripts/new-parsers/src/calculate_lcas/taxonomy.rs create mode 100644 scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs diff --git a/scripts/helper_scripts/new-parsers/src/calculate_lcas/mod.rs b/scripts/helper_scripts/new-parsers/src/calculate_lcas/mod.rs new file mode 100644 index 00000000..652b0277 --- /dev/null +++ b/scripts/helper_scripts/new-parsers/src/calculate_lcas/mod.rs @@ -0,0 +1 @@ +pub mod taxonomy; diff --git a/scripts/helper_scripts/new-parsers/src/calculate_lcas/taxonomy.rs b/scripts/helper_scripts/new-parsers/src/calculate_lcas/taxonomy.rs new file mode 100644 index 00000000..801e8c81 --- /dev/null +++ b/scripts/helper_scripts/new-parsers/src/calculate_lcas/taxonomy.rs @@ -0,0 +1,143 @@ +use std::collections::HashMap; +use std::io::BufRead; +use std::path::PathBuf; + +use crate::taxons_uniprots_tables::utils::now_str; +use crate::utils::files::{open_read, open_sin}; + +const GENUS: u8 = 18; +const RANKS: u8 = 27; +const SPECIES: u8 = 22; +const NULL: &str = "\\N"; +const SEPARATOR: &str = "\t"; + +pub struct Taxonomy { + taxonomy: Vec>, +} + +impl Taxonomy { + pub fn build(infile: &PathBuf) -> Self { + let mut taxonomy_map: HashMap> = HashMap::new(); + let reader = open_read(infile); + + let mut max = i32::MIN; + + for line in reader.lines() { + let line = line.expect("error reading line"); + let elements: Vec = line.splitn(28, SEPARATOR).map(String::from).collect(); + + let key: i32 = elements[0].parse().expect("error parsing integer value"); + let lineage = elements.iter().skip(1).map(parse_int).collect(); + taxonomy_map.insert(key, lineage); + + // Keep track of highest key + if key > max { + max = key; + } + } + + let mut taxonomy = vec![Vec::new(); (max + 1) as usize]; + + for (key, value) in taxonomy_map { + taxonomy[key as usize] = value; + } + + Taxonomy { + taxonomy, + } + } + + pub fn calculate_lcas(&self) { + let reader = open_sin(); + + let mut current_sequence = String::new(); + let mut taxa: Vec = Vec::new(); + + for (i, line) in reader.lines().enumerate() { + if i % 10000000 == 0 { + eprintln!("{}: {}", now_str(), i); + } + + let line = line.expect("error reading line from stdin"); + + let (sequence, taxon_id) = line.split_once(SEPARATOR).expect("error splitting line"); + let taxon_id: i32 = taxon_id.trim_end().parse().expect("error parsing taxon id to int"); + + if current_sequence.is_empty() || current_sequence != sequence { + if !current_sequence.is_empty() { + self.handle_lca(¤t_sequence, self.calculate_lca(&taxa)); + } + + current_sequence = sequence.to_string(); + taxa.clear(); + } + + taxa.push(taxon_id); + } + + self.handle_lca(¤t_sequence, self.calculate_lca(&taxa)); + } + + fn calculate_lca(&self, taxa: &Vec) -> i32 { + let mut lca = 1; + + let lineages: Vec<&Vec> = taxa.iter().map(|x| &self.taxonomy[*x as usize]).filter(|x| !x.is_empty()).collect(); + + for rank in 0..RANKS { + let final_rank = rank; + let mut value = -1; + + let iterator = lineages.iter() + .map(|&x| x[final_rank as usize]) + .filter(|&x| if final_rank == GENUS || final_rank == SPECIES { x > 0 } else { x >= 0 }); + + let mut all_match = true; + + // This was near-impossible to do with the iterators above, + // so we're using a simplified loop here + for item in iterator { + if value == -1 { + value = item; + } else { + if item != value { + all_match = false; + break; + } + } + } + + // If we found a new value that matched for all of them, use this as the new best + if value != -1 { + // If not everything matched, this is not a common ancestor anymore, + // so we can stop + if !all_match { + break; + } + + if value != 0 { + lca = value; + } + } + } + + lca + } + + fn handle_lca(&self, sequence: &String, lca: i32) { + println!("{}\t{}", sequence, lca); + } +} + +fn parse_int(s: &String) -> i32 { + if s == NULL { + return 0; + } + + match s.parse::() { + Ok(v) => v, + Err(e) => { + eprintln!("error parsing {} as an integer: {:?}", s, e); + std::process::exit(1); + } + } +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs new file mode 100644 index 00000000..35c44b95 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs @@ -0,0 +1,14 @@ +use std::path::PathBuf; +use clap::Parser; +use unipept::calculate_lcas::taxonomy::Taxonomy; + +#[derive(Parser)] +struct Cli { + #[clap(long)] + infile: PathBuf +} + +fn main() { + let args = Cli::parse(); + let _ = Taxonomy::build(&args.infile); +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/lib.rs b/scripts/helper_scripts/unipept-database-rs/src/lib.rs index 262d6198..b9687f88 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/lib.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/lib.rs @@ -1,2 +1,3 @@ +pub mod calculate_lcas; pub mod taxons_uniprots_tables; pub mod utils; From 5f0523d85ab7c2fbbef41cbe24177faaea89875e Mon Sep 17 00:00:00 2001 From: Stijn De Clercq Date: Wed, 8 Nov 2023 12:02:34 +0100 Subject: [PATCH 02/18] Linting --- .../new-parsers/src/calculate_lcas/taxonomy.rs | 8 +++----- .../helper_scripts/unipept-database-rs/src/bin/lcas.rs | 8 +++++++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/helper_scripts/new-parsers/src/calculate_lcas/taxonomy.rs b/scripts/helper_scripts/new-parsers/src/calculate_lcas/taxonomy.rs index 801e8c81..6a4d76e5 100644 --- a/scripts/helper_scripts/new-parsers/src/calculate_lcas/taxonomy.rs +++ b/scripts/helper_scripts/new-parsers/src/calculate_lcas/taxonomy.rs @@ -54,7 +54,7 @@ impl Taxonomy { let mut taxa: Vec = Vec::new(); for (i, line) in reader.lines().enumerate() { - if i % 10000000 == 0 { + if i % 10000000 == 0 && i != 0 { eprintln!("{}: {}", now_str(), i); } @@ -78,7 +78,7 @@ impl Taxonomy { self.handle_lca(¤t_sequence, self.calculate_lca(&taxa)); } - fn calculate_lca(&self, taxa: &Vec) -> i32 { + fn calculate_lca(&self, taxa: &[i32]) -> i32 { let mut lca = 1; let lineages: Vec<&Vec> = taxa.iter().map(|x| &self.taxonomy[*x as usize]).filter(|x| !x.is_empty()).collect(); @@ -98,11 +98,9 @@ impl Taxonomy { for item in iterator { if value == -1 { value = item; - } else { - if item != value { + } else if item != value { all_match = false; break; - } } } diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs index 35c44b95..7f336d5f 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs @@ -1,6 +1,7 @@ use std::path::PathBuf; use clap::Parser; use unipept::calculate_lcas::taxonomy::Taxonomy; +use unipept::taxons_uniprots_tables::utils::now_str; #[derive(Parser)] struct Cli { @@ -10,5 +11,10 @@ struct Cli { fn main() { let args = Cli::parse(); - let _ = Taxonomy::build(&args.infile); + + eprintln!("{}: reading taxonomy", now_str()); + let tax = Taxonomy::build(&args.infile); + + eprintln!("{}: reading sequences", now_str()); + tax.calculate_lcas(); } From 0f17c9d75d4ce0b781355e9872acdcce51a688d7 Mon Sep 17 00:00:00 2001 From: Stijn De Clercq Date: Wed, 8 Nov 2023 20:29:42 +0100 Subject: [PATCH 03/18] Update error handling --- .../unipept-database-rs/src/bin/lcas.rs | 15 +- .../src/calculate_lcas/mod.rs | 1 + .../src/calculate_lcas/taxonomy.rs | 141 ++++++++++++++++++ 3 files changed, 150 insertions(+), 7 deletions(-) create mode 100644 scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/mod.rs create mode 100644 scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs index 7f336d5f..cc1b1179 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs @@ -1,7 +1,8 @@ use std::path::PathBuf; use clap::Parser; -use unipept::calculate_lcas::taxonomy::Taxonomy; -use unipept::taxons_uniprots_tables::utils::now_str; +use unipept_database::calculate_lcas::taxonomy::Taxonomy; +use unipept_database::taxons_uniprots_tables::utils::now_str; +use anyhow::{Context, Result}; #[derive(Parser)] struct Cli { @@ -9,12 +10,12 @@ struct Cli { infile: PathBuf } -fn main() { +fn main() -> Result<()> { let args = Cli::parse(); - eprintln!("{}: reading taxonomy", now_str()); - let tax = Taxonomy::build(&args.infile); + eprintln!("{}: reading taxonomy", now_str()?); + let tax = Taxonomy::build(&args.infile).context("Unable to build taxonomy")?; - eprintln!("{}: reading sequences", now_str()); - tax.calculate_lcas(); + eprintln!("{}: reading sequences", now_str()?); + Ok(tax.calculate_lcas()?) } diff --git a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/mod.rs b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/mod.rs new file mode 100644 index 00000000..652b0277 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/mod.rs @@ -0,0 +1 @@ +pub mod taxonomy; diff --git a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs new file mode 100644 index 00000000..11ab356d --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs @@ -0,0 +1,141 @@ +use std::collections::HashMap; +use std::io::BufRead; +use std::path::PathBuf; + +use anyhow::{Context, Result}; + +use crate::taxons_uniprots_tables::utils::now_str; +use crate::utils::files::{open_read, open_sin}; + +const GENUS: u8 = 18; +const RANKS: u8 = 27; +const SPECIES: u8 = 22; +const NULL: &str = "\\N"; +const SEPARATOR: &str = "\t"; + +pub struct Taxonomy { + taxonomy: Vec>, +} + +impl Taxonomy { + pub fn build(infile: &PathBuf) -> Result { + let mut taxonomy_map: HashMap> = HashMap::new(); + let reader = open_read(infile).context("Unable to open input file")?; + + let mut max = i32::MIN; + + for line in reader.lines() { + let line = line + .with_context(|| format!("Error reading line from input file {}", infile.display()))?; + let elements: Vec = line.splitn(28, SEPARATOR).map(String::from).collect(); + + let key = parse_int(&elements[0])?; + // Note on the collect::<> here: "?" can't be used inside of map() as it is a closure + // Collecting into a Result> will stop instantly when it receives one Error + // https://doc.rust-lang.org/rust-by-example/error/iter_result.html#fail-the-entire-operation-with-collect + let lineage = elements.iter().skip(1).map(parse_int).collect::>>()?; + taxonomy_map.insert(key, lineage); + + // Keep track of highest key + if key > max { + max = key; + } + } + + let mut taxonomy = vec![Vec::new(); (max + 1) as usize]; + + for (key, value) in taxonomy_map { + taxonomy[key as usize] = value; + } + + Ok(Taxonomy { + taxonomy, + }) + } + + pub fn calculate_lcas(&self) -> Result<()> { + let reader = open_sin(); + + let mut current_sequence = String::new(); + let mut taxa: Vec = Vec::new(); + + for (i, line) in reader.lines().enumerate() { + if i % 10000000 == 0 && i != 0 { + eprintln!("{}: {}", now_str()?, i); + } + + let line = line.expect("error reading line from stdin"); + + let (sequence, taxon_id) = line.split_once(SEPARATOR).expect("error splitting line"); + let taxon_id: i32 = taxon_id.trim_end().parse().expect("error parsing taxon id to int"); + + if current_sequence.is_empty() || current_sequence != sequence { + if !current_sequence.is_empty() { + self.handle_lca(¤t_sequence, self.calculate_lca(&taxa)); + } + + current_sequence = sequence.to_string(); + taxa.clear(); + } + + taxa.push(taxon_id); + } + + Ok(self.handle_lca(¤t_sequence, self.calculate_lca(&taxa))) + } + + fn calculate_lca(&self, taxa: &[i32]) -> i32 { + let mut lca = 1; + + let lineages: Vec<&Vec> = taxa.iter().map(|x| &self.taxonomy[*x as usize]).filter(|x| !x.is_empty()).collect(); + + for rank in 0..RANKS { + let final_rank = rank; + let mut value = -1; + + let iterator = lineages.iter() + .map(|&x| x[final_rank as usize]) + .filter(|&x| if final_rank == GENUS || final_rank == SPECIES { x > 0 } else { x >= 0 }); + + let mut all_match = true; + + // This was near-impossible to do with the iterators above, + // so we're using a simplified loop here + for item in iterator { + if value == -1 { + value = item; + } else if item != value { + all_match = false; + break; + } + } + + // If we found a new value that matched for all of them, use this as the new best + if value != -1 { + // If not everything matched, this is not a common ancestor anymore, + // so we can stop + if !all_match { + break; + } + + if value != 0 { + lca = value; + } + } + } + + lca + } + + fn handle_lca(&self, sequence: &String, lca: i32) { + println!("{}\t{}", sequence, lca); + } +} + +fn parse_int(s: &String) -> Result { + if s == NULL { + return Ok(0); + } + + Ok(s.parse::().with_context(|| format!("Error parsing {} as an integer", s))?) +} From eeb117246bf45799a9e0314425b5c2cc1fbed756 Mon Sep 17 00:00:00 2001 From: Stijn De Clercq Date: Wed, 8 Nov 2023 20:29:55 +0100 Subject: [PATCH 04/18] Remove merge artifact --- .../new-parsers/src/calculate_lcas/mod.rs | 1 - .../src/calculate_lcas/taxonomy.rs | 141 ------------------ 2 files changed, 142 deletions(-) delete mode 100644 scripts/helper_scripts/new-parsers/src/calculate_lcas/mod.rs delete mode 100644 scripts/helper_scripts/new-parsers/src/calculate_lcas/taxonomy.rs diff --git a/scripts/helper_scripts/new-parsers/src/calculate_lcas/mod.rs b/scripts/helper_scripts/new-parsers/src/calculate_lcas/mod.rs deleted file mode 100644 index 652b0277..00000000 --- a/scripts/helper_scripts/new-parsers/src/calculate_lcas/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub mod taxonomy; diff --git a/scripts/helper_scripts/new-parsers/src/calculate_lcas/taxonomy.rs b/scripts/helper_scripts/new-parsers/src/calculate_lcas/taxonomy.rs deleted file mode 100644 index 6a4d76e5..00000000 --- a/scripts/helper_scripts/new-parsers/src/calculate_lcas/taxonomy.rs +++ /dev/null @@ -1,141 +0,0 @@ -use std::collections::HashMap; -use std::io::BufRead; -use std::path::PathBuf; - -use crate::taxons_uniprots_tables::utils::now_str; -use crate::utils::files::{open_read, open_sin}; - -const GENUS: u8 = 18; -const RANKS: u8 = 27; -const SPECIES: u8 = 22; -const NULL: &str = "\\N"; -const SEPARATOR: &str = "\t"; - -pub struct Taxonomy { - taxonomy: Vec>, -} - -impl Taxonomy { - pub fn build(infile: &PathBuf) -> Self { - let mut taxonomy_map: HashMap> = HashMap::new(); - let reader = open_read(infile); - - let mut max = i32::MIN; - - for line in reader.lines() { - let line = line.expect("error reading line"); - let elements: Vec = line.splitn(28, SEPARATOR).map(String::from).collect(); - - let key: i32 = elements[0].parse().expect("error parsing integer value"); - let lineage = elements.iter().skip(1).map(parse_int).collect(); - taxonomy_map.insert(key, lineage); - - // Keep track of highest key - if key > max { - max = key; - } - } - - let mut taxonomy = vec![Vec::new(); (max + 1) as usize]; - - for (key, value) in taxonomy_map { - taxonomy[key as usize] = value; - } - - Taxonomy { - taxonomy, - } - } - - pub fn calculate_lcas(&self) { - let reader = open_sin(); - - let mut current_sequence = String::new(); - let mut taxa: Vec = Vec::new(); - - for (i, line) in reader.lines().enumerate() { - if i % 10000000 == 0 && i != 0 { - eprintln!("{}: {}", now_str(), i); - } - - let line = line.expect("error reading line from stdin"); - - let (sequence, taxon_id) = line.split_once(SEPARATOR).expect("error splitting line"); - let taxon_id: i32 = taxon_id.trim_end().parse().expect("error parsing taxon id to int"); - - if current_sequence.is_empty() || current_sequence != sequence { - if !current_sequence.is_empty() { - self.handle_lca(¤t_sequence, self.calculate_lca(&taxa)); - } - - current_sequence = sequence.to_string(); - taxa.clear(); - } - - taxa.push(taxon_id); - } - - self.handle_lca(¤t_sequence, self.calculate_lca(&taxa)); - } - - fn calculate_lca(&self, taxa: &[i32]) -> i32 { - let mut lca = 1; - - let lineages: Vec<&Vec> = taxa.iter().map(|x| &self.taxonomy[*x as usize]).filter(|x| !x.is_empty()).collect(); - - for rank in 0..RANKS { - let final_rank = rank; - let mut value = -1; - - let iterator = lineages.iter() - .map(|&x| x[final_rank as usize]) - .filter(|&x| if final_rank == GENUS || final_rank == SPECIES { x > 0 } else { x >= 0 }); - - let mut all_match = true; - - // This was near-impossible to do with the iterators above, - // so we're using a simplified loop here - for item in iterator { - if value == -1 { - value = item; - } else if item != value { - all_match = false; - break; - } - } - - // If we found a new value that matched for all of them, use this as the new best - if value != -1 { - // If not everything matched, this is not a common ancestor anymore, - // so we can stop - if !all_match { - break; - } - - if value != 0 { - lca = value; - } - } - } - - lca - } - - fn handle_lca(&self, sequence: &String, lca: i32) { - println!("{}\t{}", sequence, lca); - } -} - -fn parse_int(s: &String) -> i32 { - if s == NULL { - return 0; - } - - match s.parse::() { - Ok(v) => v, - Err(e) => { - eprintln!("error parsing {} as an integer: {:?}", s, e); - std::process::exit(1); - } - } -} From 0565ab3df3b5690de17293ff21d1bdfb051b3ff9 Mon Sep 17 00:00:00 2001 From: Stijn De Clercq Date: Tue, 14 Nov 2023 09:42:51 +0100 Subject: [PATCH 05/18] Fix error handling --- .../helper_scripts/unipept-database-rs/src/bin/lcas.rs | 4 ++-- .../unipept-database-rs/src/calculate_lcas/taxonomy.rs | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs index cc1b1179..e36f90fc 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs @@ -13,9 +13,9 @@ struct Cli { fn main() -> Result<()> { let args = Cli::parse(); - eprintln!("{}: reading taxonomy", now_str()?); + eprintln!("{}: reading taxonomy", now_str()); let tax = Taxonomy::build(&args.infile).context("Unable to build taxonomy")?; - eprintln!("{}: reading sequences", now_str()?); + eprintln!("{}: reading sequences", now_str()); Ok(tax.calculate_lcas()?) } diff --git a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs index 11ab356d..278ce86a 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs @@ -61,13 +61,13 @@ impl Taxonomy { for (i, line) in reader.lines().enumerate() { if i % 10000000 == 0 && i != 0 { - eprintln!("{}: {}", now_str()?, i); + eprintln!("{}: {}", now_str(), i); } - let line = line.expect("error reading line from stdin"); + let line = line.context("error reading line from stdin")?; - let (sequence, taxon_id) = line.split_once(SEPARATOR).expect("error splitting line"); - let taxon_id: i32 = taxon_id.trim_end().parse().expect("error parsing taxon id to int"); + let (sequence, taxon_id) = line.split_once(SEPARATOR).context("error splitting line")?; + let taxon_id: i32 = taxon_id.trim_end().parse().context("error parsing taxon id to int")?; if current_sequence.is_empty() || current_sequence != sequence { if !current_sequence.is_empty() { From adf56e18b48136a5ebb0ad0af42845db6dee5941 Mon Sep 17 00:00:00 2001 From: Stijn De Clercq Date: Thu, 9 Nov 2023 23:43:48 +0100 Subject: [PATCH 06/18] Parsing and invalidating taxon dumps --- .../unipept-database-rs/Cargo.lock | 39 +++++ .../unipept-database-rs/Cargo.toml | 1 + .../src/bin/taxons-lineages.rs | 25 +++ .../unipept-database-rs/src/lib.rs | 1 + .../src/taxons_lineages/mod.rs | 1 + .../src/taxons_lineages/taxon_list.rs | 153 ++++++++++++++++++ .../src/taxons_uniprots_tables/models.rs | 11 +- .../src/taxons_uniprots_tables/taxon_list.rs | 59 +------ 8 files changed, 228 insertions(+), 62 deletions(-) create mode 100644 scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs create mode 100644 scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/mod.rs create mode 100644 scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs diff --git a/scripts/helper_scripts/unipept-database-rs/Cargo.lock b/scripts/helper_scripts/unipept-database-rs/Cargo.lock index af2c0268..b0d329c5 100644 --- a/scripts/helper_scripts/unipept-database-rs/Cargo.lock +++ b/scripts/helper_scripts/unipept-database-rs/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + [[package]] name = "android-tzdata" version = "0.1.1" @@ -309,6 +318,35 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "regex" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + [[package]] name = "smartstring" version = "1.0.1" @@ -357,6 +395,7 @@ dependencies = [ "bit-vec", "chrono", "clap", + "regex", "smartstring", "uniprot", ] diff --git a/scripts/helper_scripts/unipept-database-rs/Cargo.toml b/scripts/helper_scripts/unipept-database-rs/Cargo.toml index 07f6cfdc..37fe9e83 100644 --- a/scripts/helper_scripts/unipept-database-rs/Cargo.toml +++ b/scripts/helper_scripts/unipept-database-rs/Cargo.toml @@ -15,5 +15,6 @@ anyhow = "1.0.75" bit-vec = "0.6.3" chrono = "0.4.31" clap = { version = "4.4.6", features = ["derive"] } +regex = "1.10.2" smartstring = { version = "1.0" } uniprot = "0.7.0" diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs new file mode 100644 index 00000000..6d54a3b2 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs @@ -0,0 +1,25 @@ +use std::path::PathBuf; +use clap::Parser; +use unipept_database::taxons_lineages::taxon_list::TaxonList; +use anyhow::{Context, Result}; + +fn main() -> Result<()>{ + let args = Cli::parse(); + + let mut tl = TaxonList::from_dumps(&args.names, &args.nodes).context("Failed to parse TaxonList from dumps")?; + tl.invalidate().context("Failed to validate TaxonList")?; + + Ok(()) +} + +#[derive(Parser, Debug)] +struct Cli { + #[clap(short, long)] + names: PathBuf, + #[clap(short, long)] + nodes: PathBuf, + #[clap(short, long)] + taxons: PathBuf, + #[clap(short, long)] + lineages: PathBuf, +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/lib.rs b/scripts/helper_scripts/unipept-database-rs/src/lib.rs index b9687f88..9200454d 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/lib.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/lib.rs @@ -1,3 +1,4 @@ pub mod calculate_lcas; +pub mod taxons_lineages; pub mod taxons_uniprots_tables; pub mod utils; diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/mod.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/mod.rs new file mode 100644 index 00000000..552415d3 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/mod.rs @@ -0,0 +1 @@ +pub mod taxon_list; diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs new file mode 100644 index 00000000..647637d7 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs @@ -0,0 +1,153 @@ +use std::io::{BufRead, Read}; +use std::path::PathBuf; +use std::str::FromStr; + +use anyhow::{Context, Error, Result}; +use regex::Regex; + +use crate::taxons_uniprots_tables::models::{Rank, Taxon}; +use crate::utils::files::open_read; + +pub struct TaxonList { + entries: Vec>, +} + +impl TaxonList { + /// Parse a list of Taxons from the names and nodes dumps + pub fn from_dumps(names_pb: &PathBuf, nodes_pb: &PathBuf) -> Result { + let scientific_name = "SCIENTIFIC_NAME"; + let pattern = "\\|"; + + let mut entries = vec![]; + + let mut names = open_read(names_pb).context("Unable to open names dump file")?; + let nodes = open_read(nodes_pb).context("Unable to open nodes dump file")?; + + for node_line in nodes.lines() { + let node_line = node_line.context("Error reading line from nodes dump file")?; + let node_row: Vec<&str> = node_line.split(pattern).collect(); + + let taxon_id = parse_id(node_row[0])?; + let parent_id = parse_id(node_row[1])?; + + let rank = Rank::from_str(node_row[2].trim()).context("Unable to parse Taxon Rank")?; + + let mut name = String::new(); + let mut clas = String::new(); + let mut taxon_id2 = usize::MAX; + + for name_line in names.by_ref().lines() { + let name_line = name_line.context("Error reading line from names dump file")?; + let name_row: Vec<&str> = name_line.split(pattern).collect(); + taxon_id2 = parse_id(name_row[0])?; + name = name_row[1].trim().to_string(); + clas = name_row[3].trim().to_string(); + + if clas == scientific_name { + break; + } + } + + if clas == scientific_name && taxon_id == taxon_id2 { + while entries.len() <= taxon_id { + entries.push(None); + } + + entries[taxon_id] = Some(Taxon::new( + name, + rank, + parent_id, + true, + )); + } else { + return Err(Error::msg(format!("Taxon {} did not have a scientific name", taxon_id))); + } + } + + Ok(TaxonList { + entries, + }) + } + + pub fn invalidate(&mut self) -> Result<()> { + for i in 0..self.entries.len() { + self.validate(i)?; + } + + Ok(()) + } + + fn validate(&mut self, id: usize) -> Result { + let re = Regex::new(r".*\\d.*").context("Failed to initialize regex")?; + + let taxon = self.entries.get_mut(id).with_context(|| format!("Missing Taxon with id {}", id))?; + let taxon = match taxon { + Some(t) => t, + None => return Ok(false), + }; + + // TODO big if statement + if !taxon.valid + || (taxon.rank == Rank::Species + && ( + (re.is_match(taxon.name.as_str()) && !taxon.name.contains("virus")) + || taxon.name.ends_with(" sp.") + || taxon.name.ends_with(" genomosp.") + || taxon.name.ends_with(" bacterium") + ) + ) + || taxon.name.contains("enrichment culture") + || taxon.name.contains("mixed culture") + || taxon.name.contains("uncultured") + || taxon.name.contains("unidentified") + || taxon.name.contains("unspecified") + || taxon.name.contains("undetermined") + || taxon.name.contains("sample") + || taxon.name.ends_with("metagenome") + || taxon.name.ends_with("library") + || id == 28384 + || id == 48479 + || id == 1869227 { + taxon.valid = false; + return Ok(false); + } + + if id == 1 { + return Ok(true); + } + + let parent = taxon.parent; + let parent_valid = self.validate(parent)?; + + // I don't like this duplication but we have to do it because of the borrow checker + // Otherwise, the recursive call above ^ will cause two mutable references at the same time + // And we need one to mark the taxon as invalid + let taxon = self.entries.get_mut(id).with_context(|| format!("Missing Taxon with id {}", id))?; + let taxon = match taxon { + Some(t) => t, + None => return Ok(false), + }; + + if !parent_valid { + taxon.valid = false; + } + + return Ok(taxon.valid); + } + + pub fn get(&self, i: usize) -> &Option { + &self.entries[i] + } + + pub fn is_empty(&self) -> bool { + self.entries.is_empty() + } + + pub fn len(&self) -> usize { + self.entries.len() + } +} + +fn parse_id(v: &str) -> Result { + v.trim().parse::().with_context(|| format!("Unable to parse {} as usize", v)) +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs index a0c70214..9f87b5ed 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs @@ -87,7 +87,7 @@ pub fn calculate_entry_digest( result } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Rank { NoRank, SuperKingdom, @@ -160,13 +160,12 @@ impl FromStr for Rank { } } -#[allow(dead_code)] // The fields in this struct aren't used YET, but will be later on #[derive(Debug)] pub struct Taxon { - name: String, - rank: Rank, - parent: usize, - valid: bool, + pub name: String, + pub rank: Rank, + pub parent: usize, + pub valid: bool, } impl Taxon { diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/taxon_list.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/taxon_list.rs index fb876817..2c1b53a2 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/taxon_list.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/taxon_list.rs @@ -1,63 +1,10 @@ -use crate::taxons_uniprots_tables::models::{Rank, Taxon}; -use anyhow::{Context, Result}; -use bit_vec::BitVec; use std::io::BufRead; use std::path::PathBuf; -use std::str::FromStr; - -use crate::utils::files::open_read; - -pub struct TaxonList { - entries: Vec>, -} - -impl TaxonList { - pub fn from_file(pb: &PathBuf) -> Result { - let mut entries = Vec::new(); - let reader = open_read(pb).context("Unable to open input file")?; - - for line in reader.lines() { - let line = line - .with_context(|| format!("Error reading line from input file {}", pb.display()))?; - let spl: Vec<&str> = line.split('\t').collect(); - let id: usize = spl[0] - .parse() - .with_context(|| format!("Unable to parse {} as usize", spl[0]))?; - let parent: usize = spl[3] - .parse() - .with_context(|| format!("Unable to parse {} as usize", spl[3]))?; - let valid = spl[4].trim() == "true"; - - let taxon = Taxon::new( - spl[1].to_string(), - Rank::from_str(spl[2]) - .with_context(|| format!("Unable to parse {} into Rank", spl[2]))?, - parent, - valid, - ); - - while entries.len() <= id { - entries.push(None); - } - - entries[id] = Some(taxon); - } - - Ok(TaxonList { entries }) - } - pub fn get(&self, i: usize) -> &Option { - &self.entries[i] - } - - pub fn is_empty(&self) -> bool { - self.entries.is_empty() - } +use anyhow::{Context, Result}; +use bit_vec::BitVec; - pub fn len(&self) -> usize { - self.entries.len() - } -} +use crate::utils::files::open_read; /// Parse a taxons TSV-file into a vector that can be accessed by id /// The actual content of these Taxons is never used, so we don't try to parse a struct From c1b20e3173580a585c68023a011280f796273f3a Mon Sep 17 00:00:00 2001 From: Stijn De Clercq Date: Fri, 10 Nov 2023 12:59:39 +0100 Subject: [PATCH 07/18] First version of validation --- .../unipept-database-rs/Cargo.lock | 27 +++++ .../unipept-database-rs/Cargo.toml | 2 + .../src/bin/taxons-lineages.rs | 5 + .../src/taxons_lineages/taxon_list.rs | 102 ++++++++++++++++- .../src/taxons_uniprots_tables/models.rs | 108 ++++++------------ 5 files changed, 168 insertions(+), 76 deletions(-) diff --git a/scripts/helper_scripts/unipept-database-rs/Cargo.lock b/scripts/helper_scripts/unipept-database-rs/Cargo.lock index b0d329c5..2a5816d6 100644 --- a/scripts/helper_scripts/unipept-database-rs/Cargo.lock +++ b/scripts/helper_scripts/unipept-database-rs/Cargo.lock @@ -347,6 +347,12 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" + [[package]] name = "smartstring" version = "1.0.1" @@ -370,6 +376,25 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strum" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" + +[[package]] +name = "strum_macros" +version = "0.25.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "syn" version = "2.0.37" @@ -397,6 +422,8 @@ dependencies = [ "clap", "regex", "smartstring", + "strum", + "strum_macros", "uniprot", ] diff --git a/scripts/helper_scripts/unipept-database-rs/Cargo.toml b/scripts/helper_scripts/unipept-database-rs/Cargo.toml index 37fe9e83..e83af45c 100644 --- a/scripts/helper_scripts/unipept-database-rs/Cargo.toml +++ b/scripts/helper_scripts/unipept-database-rs/Cargo.toml @@ -17,4 +17,6 @@ chrono = "0.4.31" clap = { version = "4.4.6", features = ["derive"] } regex = "1.10.2" smartstring = { version = "1.0" } +strum = "0.25.0" +strum_macros = "0.25.3" uniprot = "0.7.0" diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs index 6d54a3b2..f3a02e28 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs @@ -7,7 +7,12 @@ fn main() -> Result<()>{ let args = Cli::parse(); let mut tl = TaxonList::from_dumps(&args.names, &args.nodes).context("Failed to parse TaxonList from dumps")?; + eprintln!("Done loading dumps"); tl.invalidate().context("Failed to validate TaxonList")?; + eprintln!("Done invalidating"); + tl.write_taxons(&args.taxons).context("Failed to write TaxonList")?; + eprintln!("Done writing taxons"); + tl.write_lineages(&args.lineages).context("Failed to write lineages")?; Ok(()) } diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs index 647637d7..81fb124c 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs @@ -1,12 +1,13 @@ -use std::io::{BufRead, Read}; +use std::io::{BufRead, Read, Write}; use std::path::PathBuf; use std::str::FromStr; use anyhow::{Context, Error, Result}; use regex::Regex; +use strum::IntoEnumIterator; use crate::taxons_uniprots_tables::models::{Rank, Taxon}; -use crate::utils::files::open_read; +use crate::utils::files::{open_read, open_write}; pub struct TaxonList { entries: Vec>, @@ -15,8 +16,8 @@ pub struct TaxonList { impl TaxonList { /// Parse a list of Taxons from the names and nodes dumps pub fn from_dumps(names_pb: &PathBuf, nodes_pb: &PathBuf) -> Result { - let scientific_name = "SCIENTIFIC_NAME"; - let pattern = "\\|"; + let scientific_name = "scientific name"; + let pattern = "|"; let mut entries = vec![]; @@ -71,6 +72,7 @@ impl TaxonList { pub fn invalidate(&mut self) -> Result<()> { for i in 0..self.entries.len() { + eprintln!("Validating {}", i); self.validate(i)?; } @@ -86,7 +88,6 @@ impl TaxonList { None => return Ok(false), }; - // TODO big if statement if !taxon.valid || (taxon.rank == Rank::Species && ( @@ -122,7 +123,7 @@ impl TaxonList { // I don't like this duplication but we have to do it because of the borrow checker // Otherwise, the recursive call above ^ will cause two mutable references at the same time // And we need one to mark the taxon as invalid - let taxon = self.entries.get_mut(id).with_context(|| format!("Missing Taxon with id {}", id))?; + let taxon = self.entries.get_mut(id).with_context(|| format!("Missing taxon with id {}", id))?; let taxon = match taxon { Some(t) => t, None => return Ok(false), @@ -135,6 +136,95 @@ impl TaxonList { return Ok(taxon.valid); } + pub fn write_taxons(&self, pb: &PathBuf) -> Result<()> { + let mut writer = open_write(pb).context("Unable to open taxon output file")?; + + for (id, taxon) in self.entries.iter().enumerate() { + let taxon = if let Some(t) = taxon { + t + } else { + continue + }; + + writeln!( + &mut writer, + "{}\t{}\t{}\t{}\t{}", + id, taxon.name, taxon.rank.to_string(), taxon.parent, taxon.valid + ).context("Error writing to taxon TSV file")?; + } + + Ok(()) + } + + pub fn write_lineages(&self, pb: &PathBuf) -> Result<()> { + let mut writer = open_write(pb).context("Unable to open lineage output file")?; + let n_ranks = Rank::iter().count(); + + for (i, taxon) in self.entries.iter().enumerate() { + if taxon.is_none() { + continue; + } + + let mut lineage: Vec = Vec::with_capacity(n_ranks); + lineage[0] = i.to_string(); + + let mut tid = self.ranked_ancestor(i)?; + let mut taxon = self.get_taxon_some(tid)?; + let mut valid = taxon.valid; + + for j in ((n_ranks-1)..=1).rev() { + if j > taxon.rank.index() { + lineage[j] = if valid { "null".to_string() } else { "-1".to_string() }; + } else { + valid = taxon.valid; + lineage[j] = (if valid { 1 } else { -1 } * (tid as i32)).to_string(); + tid = self.ranked_ancestor(taxon.parent)?; + taxon = self.get_taxon_some(tid)?; + } + } + + writeln!( + &mut writer, + "{}", + lineage.join("\t") + ).context("Error writing to lineage TSV file")?; + } + + Ok(()) + } + + fn ranked_ancestor(&self, mut tid: usize) -> Result { + let mut taxon = self.get_taxon(tid)?; + let mut pid = usize::MAX; + + // Note: this unwrap() call is safe because of the is_some() beforehand + while taxon.is_some() && tid != pid && taxon.as_ref().unwrap().rank == Rank::NoRank { + pid = tid; + tid = taxon.as_ref().unwrap().parent; + taxon = self.get_taxon(tid)?; + } + + if taxon.is_some() { + return Ok(tid); + } + + Ok(1) // Used in case a taxon is no descendant of the root + } + + fn get_taxon(&self, id: usize) -> Result<&Option> { + self.entries.get(id).with_context(|| format!("Invalid taxon id {}", id)) + } + + /// Similar to get_taxon, but unwraps the Option and gives a reference to the Taxon inside of it + /// This will throw an error if the Taxon is None + fn get_taxon_some(&self, id: usize) -> Result<&Taxon> { + if let Some(t) = self.get_taxon(id)? { + Ok(t) + } else { + Err(Error::msg(format!("Missing taxon with id {}", id))) + } + } + pub fn get(&self, i: usize) -> &Option { &self.entries[i] } diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs index 9f87b5ed..0aa3cbd5 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs @@ -1,6 +1,5 @@ -use std::str::FromStr; - -use anyhow::{Context, Error, Result}; +use strum_macros::{Display, EnumCount, EnumIter, EnumString}; +use anyhow::{Context, Result}; #[derive(Debug)] pub struct Entry { @@ -87,76 +86,45 @@ pub fn calculate_entry_digest( result } -#[derive(Debug, PartialEq)] +// This is taken directly from UMGAP, with Infraclass and Parvorder removed +// Once these changes are merged in UMGAP, this can be replaced with a dependency +// TODO +#[rustfmt::skip] +#[derive(PartialEq, Eq, Debug, Clone, Copy, Display, EnumString, EnumCount, EnumIter)] pub enum Rank { - NoRank, - SuperKingdom, - Kingdom, - SubKingdom, - SuperPhylum, - Phylum, - SubPhylum, - SuperClass, - Class, - SubClass, - SuperOrder, - Order, - SubOrder, - InfraOrder, - SuperFamily, - Family, - SubFamily, - Tribe, - SubTribe, - Genus, - SubGenus, - SpeciesGroup, - SpeciesSubgroup, - Species, - SubSpecies, - Strain, - Varietas, - Forma, + #[strum(serialize="no rank")] NoRank, + #[strum(serialize="superkingdom")] Superkingdom, + #[strum(serialize="kingdom")] Kingdom, + #[strum(serialize="subkingdom")] Subkingdom, + #[strum(serialize="superphylum")] Superphylum, + #[strum(serialize="phylum")] Phylum, + #[strum(serialize="subphylum")] Subphylum, + #[strum(serialize="superclass")] Superclass, + #[strum(serialize="class")] Class, + #[strum(serialize="subclass")] Subclass, + #[strum(serialize="superorder")] Superorder, + #[strum(serialize="order")] Order, + #[strum(serialize="suborder")] Suborder, + #[strum(serialize="infraorder")] Infraorder, + #[strum(serialize="superfamily")] Superfamily, + #[strum(serialize="family")] Family, + #[strum(serialize="subfamily")] Subfamily, + #[strum(serialize="tribe")] Tribe, + #[strum(serialize="subtribe")] Subtribe, + #[strum(serialize="genus")] Genus, + #[strum(serialize="subgenus")] Subgenus, + #[strum(serialize="species group")] SpeciesGroup, + #[strum(serialize="species subgroup")] SpeciesSubgroup, + #[strum(serialize="species")] Species, + #[strum(serialize="subspecies")] Subspecies, + #[strum(serialize="strain")] Strain, + #[strum(serialize="varietas")] Varietas, + #[strum(serialize="forma")] Forma, } -impl FromStr for Rank { - type Err = Error; - - fn from_str(s: &str) -> Result { - match s.to_uppercase().replace(' ', "_").as_str() { - "CLASS" => Ok(Self::Class), - "FAMILY" => Ok(Self::Family), - "FORMA" => Ok(Self::Forma), - "GENUS" => Ok(Self::Genus), - "INFRAORDER" => Ok(Self::InfraOrder), - "KINGDOM" => Ok(Self::Kingdom), - "NO_RANK" => Ok(Self::NoRank), - "ORDER" => Ok(Self::Order), - "PHYLUM" => Ok(Self::Phylum), - "SPECIES" => Ok(Self::Species), - "SPECIES_GROUP" => Ok(Self::SpeciesGroup), - "SPECIES_SUBGROUP" => Ok(Self::SpeciesSubgroup), - "STRAIN" => Ok(Self::Strain), - "SUBCLASS" => Ok(Self::SubClass), - "SUBFAMILY" => Ok(Self::SubFamily), - "SUBGENUS" => Ok(Self::SubGenus), - "SUBKINGDOM" => Ok(Self::SubKingdom), - "SUBORDER" => Ok(Self::SubOrder), - "SUBPHYLUM" => Ok(Self::SubPhylum), - "SUBSPECIES" => Ok(Self::SubSpecies), - "SUBTRIBE" => Ok(Self::SubTribe), - "SUPERCLASS" => Ok(Self::SuperClass), - "SUPERFAMILY" => Ok(Self::SuperFamily), - "SUPERKINGDOM" => Ok(Self::SuperKingdom), - "SUPERORDER" => Ok(Self::SuperOrder), - "SUPERPHYLUM" => Ok(Self::SuperPhylum), - "TRIBE" => Ok(Self::Tribe), - "VARIETAS" => Ok(Self::Varietas), - _ => Err(Error::msg(format!( - "Value {} does not match any known ranks", - s - ))), - } +impl Rank { + pub fn index(&self) -> usize { + *self as usize } } From 4f68219f5e75b155d191d5e351df7fe97f533b61 Mon Sep 17 00:00:00 2001 From: Stijn De Clercq Date: Sat, 11 Nov 2023 00:18:22 +0100 Subject: [PATCH 08/18] Bugfixes --- .../src/bin/taxons-lineages.rs | 3 --- .../src/taxons_lineages/taxon_list.rs | 23 ++++++++++--------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs index f3a02e28..e2459369 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs @@ -7,11 +7,8 @@ fn main() -> Result<()>{ let args = Cli::parse(); let mut tl = TaxonList::from_dumps(&args.names, &args.nodes).context("Failed to parse TaxonList from dumps")?; - eprintln!("Done loading dumps"); tl.invalidate().context("Failed to validate TaxonList")?; - eprintln!("Done invalidating"); tl.write_taxons(&args.taxons).context("Failed to write TaxonList")?; - eprintln!("Done writing taxons"); tl.write_lineages(&args.lineages).context("Failed to write lineages")?; Ok(()) diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs index 81fb124c..29e99ef4 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs @@ -11,6 +11,7 @@ use crate::utils::files::{open_read, open_write}; pub struct TaxonList { entries: Vec>, + validation_regex: Regex, } impl TaxonList { @@ -67,12 +68,12 @@ impl TaxonList { Ok(TaxonList { entries, + validation_regex: Regex::new(r".*\d.*").context("Failed to initialize regex")?, }) } pub fn invalidate(&mut self) -> Result<()> { for i in 0..self.entries.len() { - eprintln!("Validating {}", i); self.validate(i)?; } @@ -80,8 +81,6 @@ impl TaxonList { } fn validate(&mut self, id: usize) -> Result { - let re = Regex::new(r".*\\d.*").context("Failed to initialize regex")?; - let taxon = self.entries.get_mut(id).with_context(|| format!("Missing Taxon with id {}", id))?; let taxon = match taxon { Some(t) => t, @@ -91,10 +90,10 @@ impl TaxonList { if !taxon.valid || (taxon.rank == Rank::Species && ( - (re.is_match(taxon.name.as_str()) && !taxon.name.contains("virus")) + (self.validation_regex.is_match(taxon.name.as_str()) && !taxon.name.contains("virus")) || taxon.name.ends_with(" sp.") || taxon.name.ends_with(" genomosp.") - || taxon.name.ends_with(" bacterium") + || taxon.name.contains(" bacterium") ) ) || taxon.name.contains("enrichment culture") @@ -133,7 +132,7 @@ impl TaxonList { taxon.valid = false; } - return Ok(taxon.valid); + Ok(taxon.valid) } pub fn write_taxons(&self, pb: &PathBuf) -> Result<()> { @@ -143,13 +142,15 @@ impl TaxonList { let taxon = if let Some(t) = taxon { t } else { - continue + continue; }; + let valid = if taxon.valid { '\u{0001}' } else { '\u{0000}' }; + writeln!( &mut writer, "{}\t{}\t{}\t{}\t{}", - id, taxon.name, taxon.rank.to_string(), taxon.parent, taxon.valid + id, taxon.name, taxon.rank, taxon.parent, valid ).context("Error writing to taxon TSV file")?; } @@ -165,16 +166,16 @@ impl TaxonList { continue; } - let mut lineage: Vec = Vec::with_capacity(n_ranks); + let mut lineage: Vec = vec![String::from("\\N"); n_ranks]; lineage[0] = i.to_string(); let mut tid = self.ranked_ancestor(i)?; let mut taxon = self.get_taxon_some(tid)?; let mut valid = taxon.valid; - for j in ((n_ranks-1)..=1).rev() { + for j in (1..=(n_ranks - 1)).rev() { if j > taxon.rank.index() { - lineage[j] = if valid { "null".to_string() } else { "-1".to_string() }; + lineage[j] = if valid { "\\N".to_string() } else { "-1".to_string() }; } else { valid = taxon.valid; lineage[j] = (if valid { 1 } else { -1 } * (tid as i32)).to_string(); From f4cbb7f7b2ab847cd79b1a7ecb93c578b1a5f61d Mon Sep 17 00:00:00 2001 From: Stijn De Clercq Date: Tue, 14 Nov 2023 09:48:35 +0100 Subject: [PATCH 09/18] Fix linting --- .../unipept-database-rs/src/bin/lcas.rs | 8 +-- .../src/calculate_lcas/taxonomy.rs | 49 +++++++++++++------ 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs index e36f90fc..75b8aa55 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs @@ -1,13 +1,13 @@ -use std::path::PathBuf; +use anyhow::{Context, Result}; use clap::Parser; +use std::path::PathBuf; use unipept_database::calculate_lcas::taxonomy::Taxonomy; use unipept_database::taxons_uniprots_tables::utils::now_str; -use anyhow::{Context, Result}; #[derive(Parser)] struct Cli { #[clap(long)] - infile: PathBuf + infile: PathBuf, } fn main() -> Result<()> { @@ -17,5 +17,5 @@ fn main() -> Result<()> { let tax = Taxonomy::build(&args.infile).context("Unable to build taxonomy")?; eprintln!("{}: reading sequences", now_str()); - Ok(tax.calculate_lcas()?) + tax.calculate_lcas() } diff --git a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs index 278ce86a..729762b7 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs @@ -25,15 +25,20 @@ impl Taxonomy { let mut max = i32::MIN; for line in reader.lines() { - let line = line - .with_context(|| format!("Error reading line from input file {}", infile.display()))?; + let line = line.with_context(|| { + format!("Error reading line from input file {}", infile.display()) + })?; let elements: Vec = line.splitn(28, SEPARATOR).map(String::from).collect(); let key = parse_int(&elements[0])?; // Note on the collect::<> here: "?" can't be used inside of map() as it is a closure // Collecting into a Result> will stop instantly when it receives one Error // https://doc.rust-lang.org/rust-by-example/error/iter_result.html#fail-the-entire-operation-with-collect - let lineage = elements.iter().skip(1).map(parse_int).collect::>>()?; + let lineage = elements + .iter() + .skip(1) + .map(parse_int) + .collect::>>()?; taxonomy_map.insert(key, lineage); // Keep track of highest key @@ -48,9 +53,7 @@ impl Taxonomy { taxonomy[key as usize] = value; } - Ok(Taxonomy { - taxonomy, - }) + Ok(Taxonomy { taxonomy }) } pub fn calculate_lcas(&self) -> Result<()> { @@ -66,8 +69,12 @@ impl Taxonomy { let line = line.context("error reading line from stdin")?; - let (sequence, taxon_id) = line.split_once(SEPARATOR).context("error splitting line")?; - let taxon_id: i32 = taxon_id.trim_end().parse().context("error parsing taxon id to int")?; + let (sequence, taxon_id) = + line.split_once(SEPARATOR).context("error splitting line")?; + let taxon_id: i32 = taxon_id + .trim_end() + .parse() + .context("error parsing taxon id to int")?; if current_sequence.is_empty() || current_sequence != sequence { if !current_sequence.is_empty() { @@ -81,21 +88,33 @@ impl Taxonomy { taxa.push(taxon_id); } - Ok(self.handle_lca(¤t_sequence, self.calculate_lca(&taxa))) + self.handle_lca(¤t_sequence, self.calculate_lca(&taxa)); + Ok(()) } fn calculate_lca(&self, taxa: &[i32]) -> i32 { let mut lca = 1; - let lineages: Vec<&Vec> = taxa.iter().map(|x| &self.taxonomy[*x as usize]).filter(|x| !x.is_empty()).collect(); + let lineages: Vec<&Vec> = taxa + .iter() + .map(|x| &self.taxonomy[*x as usize]) + .filter(|x| !x.is_empty()) + .collect(); for rank in 0..RANKS { let final_rank = rank; let mut value = -1; - let iterator = lineages.iter() + let iterator = lineages + .iter() .map(|&x| x[final_rank as usize]) - .filter(|&x| if final_rank == GENUS || final_rank == SPECIES { x > 0 } else { x >= 0 }); + .filter(|&x| { + if final_rank == GENUS || final_rank == SPECIES { + x > 0 + } else { + x >= 0 + } + }); let mut all_match = true; @@ -105,8 +124,8 @@ impl Taxonomy { if value == -1 { value = item; } else if item != value { - all_match = false; - break; + all_match = false; + break; } } @@ -137,5 +156,5 @@ fn parse_int(s: &String) -> Result { return Ok(0); } - Ok(s.parse::().with_context(|| format!("Error parsing {} as an integer", s))?) + s.parse::().with_context(|| format!("Error parsing {} as an integer", s)) } From 1bdd24fced77448985ca17ee2c4d629a9c1ef3d7 Mon Sep 17 00:00:00 2001 From: Stijn De Clercq Date: Tue, 14 Nov 2023 09:48:35 +0100 Subject: [PATCH 10/18] Fix linting --- .../unipept-database-rs/src/bin/lcas.rs | 8 +-- .../src/calculate_lcas/taxonomy.rs | 49 +++++++++++++------ 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs index e36f90fc..75b8aa55 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs @@ -1,13 +1,13 @@ -use std::path::PathBuf; +use anyhow::{Context, Result}; use clap::Parser; +use std::path::PathBuf; use unipept_database::calculate_lcas::taxonomy::Taxonomy; use unipept_database::taxons_uniprots_tables::utils::now_str; -use anyhow::{Context, Result}; #[derive(Parser)] struct Cli { #[clap(long)] - infile: PathBuf + infile: PathBuf, } fn main() -> Result<()> { @@ -17,5 +17,5 @@ fn main() -> Result<()> { let tax = Taxonomy::build(&args.infile).context("Unable to build taxonomy")?; eprintln!("{}: reading sequences", now_str()); - Ok(tax.calculate_lcas()?) + tax.calculate_lcas() } diff --git a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs index 278ce86a..729762b7 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs @@ -25,15 +25,20 @@ impl Taxonomy { let mut max = i32::MIN; for line in reader.lines() { - let line = line - .with_context(|| format!("Error reading line from input file {}", infile.display()))?; + let line = line.with_context(|| { + format!("Error reading line from input file {}", infile.display()) + })?; let elements: Vec = line.splitn(28, SEPARATOR).map(String::from).collect(); let key = parse_int(&elements[0])?; // Note on the collect::<> here: "?" can't be used inside of map() as it is a closure // Collecting into a Result> will stop instantly when it receives one Error // https://doc.rust-lang.org/rust-by-example/error/iter_result.html#fail-the-entire-operation-with-collect - let lineage = elements.iter().skip(1).map(parse_int).collect::>>()?; + let lineage = elements + .iter() + .skip(1) + .map(parse_int) + .collect::>>()?; taxonomy_map.insert(key, lineage); // Keep track of highest key @@ -48,9 +53,7 @@ impl Taxonomy { taxonomy[key as usize] = value; } - Ok(Taxonomy { - taxonomy, - }) + Ok(Taxonomy { taxonomy }) } pub fn calculate_lcas(&self) -> Result<()> { @@ -66,8 +69,12 @@ impl Taxonomy { let line = line.context("error reading line from stdin")?; - let (sequence, taxon_id) = line.split_once(SEPARATOR).context("error splitting line")?; - let taxon_id: i32 = taxon_id.trim_end().parse().context("error parsing taxon id to int")?; + let (sequence, taxon_id) = + line.split_once(SEPARATOR).context("error splitting line")?; + let taxon_id: i32 = taxon_id + .trim_end() + .parse() + .context("error parsing taxon id to int")?; if current_sequence.is_empty() || current_sequence != sequence { if !current_sequence.is_empty() { @@ -81,21 +88,33 @@ impl Taxonomy { taxa.push(taxon_id); } - Ok(self.handle_lca(¤t_sequence, self.calculate_lca(&taxa))) + self.handle_lca(¤t_sequence, self.calculate_lca(&taxa)); + Ok(()) } fn calculate_lca(&self, taxa: &[i32]) -> i32 { let mut lca = 1; - let lineages: Vec<&Vec> = taxa.iter().map(|x| &self.taxonomy[*x as usize]).filter(|x| !x.is_empty()).collect(); + let lineages: Vec<&Vec> = taxa + .iter() + .map(|x| &self.taxonomy[*x as usize]) + .filter(|x| !x.is_empty()) + .collect(); for rank in 0..RANKS { let final_rank = rank; let mut value = -1; - let iterator = lineages.iter() + let iterator = lineages + .iter() .map(|&x| x[final_rank as usize]) - .filter(|&x| if final_rank == GENUS || final_rank == SPECIES { x > 0 } else { x >= 0 }); + .filter(|&x| { + if final_rank == GENUS || final_rank == SPECIES { + x > 0 + } else { + x >= 0 + } + }); let mut all_match = true; @@ -105,8 +124,8 @@ impl Taxonomy { if value == -1 { value = item; } else if item != value { - all_match = false; - break; + all_match = false; + break; } } @@ -137,5 +156,5 @@ fn parse_int(s: &String) -> Result { return Ok(0); } - Ok(s.parse::().with_context(|| format!("Error parsing {} as an integer", s))?) + s.parse::().with_context(|| format!("Error parsing {} as an integer", s)) } From 77b9d7c17266ddac49ac6722314e495bc6a732fe Mon Sep 17 00:00:00 2001 From: Stijn De Clercq Date: Tue, 14 Nov 2023 09:49:11 +0100 Subject: [PATCH 11/18] Fix linting --- .../src/bin/taxons-lineages.rs | 15 +++-- .../src/calculate_lcas/taxonomy.rs | 3 +- .../src/taxons_lineages/taxon_list.rs | 61 +++++++++++-------- .../src/taxons_uniprots_tables/models.rs | 2 +- 4 files changed, 47 insertions(+), 34 deletions(-) diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs index e2459369..8fca8d73 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs @@ -1,15 +1,18 @@ -use std::path::PathBuf; +use anyhow::{Context, Result}; use clap::Parser; +use std::path::PathBuf; use unipept_database::taxons_lineages::taxon_list::TaxonList; -use anyhow::{Context, Result}; -fn main() -> Result<()>{ +fn main() -> Result<()> { let args = Cli::parse(); - let mut tl = TaxonList::from_dumps(&args.names, &args.nodes).context("Failed to parse TaxonList from dumps")?; + let mut tl = TaxonList::from_dumps(&args.names, &args.nodes) + .context("Failed to parse TaxonList from dumps")?; tl.invalidate().context("Failed to validate TaxonList")?; - tl.write_taxons(&args.taxons).context("Failed to write TaxonList")?; - tl.write_lineages(&args.lineages).context("Failed to write lineages")?; + tl.write_taxons(&args.taxons) + .context("Failed to write TaxonList")?; + tl.write_lineages(&args.lineages) + .context("Failed to write lineages")?; Ok(()) } diff --git a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs index 729762b7..719a46cc 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs @@ -156,5 +156,6 @@ fn parse_int(s: &String) -> Result { return Ok(0); } - s.parse::().with_context(|| format!("Error parsing {} as an integer", s)) + s.parse::() + .with_context(|| format!("Error parsing {} as an integer", s)) } diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs index 29e99ef4..e197f228 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs @@ -55,14 +55,12 @@ impl TaxonList { entries.push(None); } - entries[taxon_id] = Some(Taxon::new( - name, - rank, - parent_id, - true, - )); + entries[taxon_id] = Some(Taxon::new(name, rank, parent_id, true)); } else { - return Err(Error::msg(format!("Taxon {} did not have a scientific name", taxon_id))); + return Err(Error::msg(format!( + "Taxon {} did not have a scientific name", + taxon_id + ))); } } @@ -81,7 +79,10 @@ impl TaxonList { } fn validate(&mut self, id: usize) -> Result { - let taxon = self.entries.get_mut(id).with_context(|| format!("Missing Taxon with id {}", id))?; + let taxon = self + .entries + .get_mut(id) + .with_context(|| format!("Missing Taxon with id {}", id))?; let taxon = match taxon { Some(t) => t, None => return Ok(false), @@ -89,13 +90,11 @@ impl TaxonList { if !taxon.valid || (taxon.rank == Rank::Species - && ( - (self.validation_regex.is_match(taxon.name.as_str()) && !taxon.name.contains("virus")) - || taxon.name.ends_with(" sp.") - || taxon.name.ends_with(" genomosp.") - || taxon.name.contains(" bacterium") - ) - ) + && ((self.validation_regex.is_match(taxon.name.as_str()) + && !taxon.name.contains("virus")) + || taxon.name.ends_with(" sp.") + || taxon.name.ends_with(" genomosp.") + || taxon.name.contains(" bacterium"))) || taxon.name.contains("enrichment culture") || taxon.name.contains("mixed culture") || taxon.name.contains("uncultured") @@ -107,7 +106,8 @@ impl TaxonList { || taxon.name.ends_with("library") || id == 28384 || id == 48479 - || id == 1869227 { + || id == 1869227 + { taxon.valid = false; return Ok(false); } @@ -122,7 +122,10 @@ impl TaxonList { // I don't like this duplication but we have to do it because of the borrow checker // Otherwise, the recursive call above ^ will cause two mutable references at the same time // And we need one to mark the taxon as invalid - let taxon = self.entries.get_mut(id).with_context(|| format!("Missing taxon with id {}", id))?; + let taxon = self + .entries + .get_mut(id) + .with_context(|| format!("Missing taxon with id {}", id))?; let taxon = match taxon { Some(t) => t, None => return Ok(false), @@ -151,7 +154,8 @@ impl TaxonList { &mut writer, "{}\t{}\t{}\t{}\t{}", id, taxon.name, taxon.rank, taxon.parent, valid - ).context("Error writing to taxon TSV file")?; + ) + .context("Error writing to taxon TSV file")?; } Ok(()) @@ -175,7 +179,11 @@ impl TaxonList { for j in (1..=(n_ranks - 1)).rev() { if j > taxon.rank.index() { - lineage[j] = if valid { "\\N".to_string() } else { "-1".to_string() }; + lineage[j] = if valid { + "\\N".to_string() + } else { + "-1".to_string() + }; } else { valid = taxon.valid; lineage[j] = (if valid { 1 } else { -1 } * (tid as i32)).to_string(); @@ -184,11 +192,8 @@ impl TaxonList { } } - writeln!( - &mut writer, - "{}", - lineage.join("\t") - ).context("Error writing to lineage TSV file")?; + writeln!(&mut writer, "{}", lineage.join("\t")) + .context("Error writing to lineage TSV file")?; } Ok(()) @@ -213,7 +218,9 @@ impl TaxonList { } fn get_taxon(&self, id: usize) -> Result<&Option> { - self.entries.get(id).with_context(|| format!("Invalid taxon id {}", id)) + self.entries + .get(id) + .with_context(|| format!("Invalid taxon id {}", id)) } /// Similar to get_taxon, but unwraps the Option and gives a reference to the Taxon inside of it @@ -240,5 +247,7 @@ impl TaxonList { } fn parse_id(v: &str) -> Result { - v.trim().parse::().with_context(|| format!("Unable to parse {} as usize", v)) + v.trim() + .parse::() + .with_context(|| format!("Unable to parse {} as usize", v)) } diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs index 0aa3cbd5..6f3298d6 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs @@ -1,5 +1,5 @@ -use strum_macros::{Display, EnumCount, EnumIter, EnumString}; use anyhow::{Context, Result}; +use strum_macros::{Display, EnumCount, EnumIter, EnumString}; #[derive(Debug)] pub struct Entry { From a04f409e549ba2f5c28a5294ebed30e8bd585c42 Mon Sep 17 00:00:00 2001 From: Stijn De Clercq Date: Tue, 14 Nov 2023 09:51:43 +0100 Subject: [PATCH 12/18] Formatting --- .../unipept-database-rs/src/calculate_lcas/taxonomy.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs index 729762b7..719a46cc 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs @@ -156,5 +156,6 @@ fn parse_int(s: &String) -> Result { return Ok(0); } - s.parse::().with_context(|| format!("Error parsing {} as an integer", s)) + s.parse::() + .with_context(|| format!("Error parsing {} as an integer", s)) } From ae56c7130733b32c8cc033fec85406d61b59b463 Mon Sep 17 00:00:00 2001 From: Stijn De Clercq Date: Tue, 14 Nov 2023 09:51:43 +0100 Subject: [PATCH 13/18] Formatting --- .../unipept-database-rs/src/calculate_lcas/taxonomy.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs index 729762b7..719a46cc 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs @@ -156,5 +156,6 @@ fn parse_int(s: &String) -> Result { return Ok(0); } - s.parse::().with_context(|| format!("Error parsing {} as an integer", s)) + s.parse::() + .with_context(|| format!("Error parsing {} as an integer", s)) } From eeeb0e1810fbbcf4add5057f3f041ff3abf53a6c Mon Sep 17 00:00:00 2001 From: stijndcl Date: Tue, 28 Nov 2023 10:27:21 +0100 Subject: [PATCH 14/18] Apply suggestions by Rien --- .../src/calculate_lcas/taxonomy.rs | 46 ++++++++----------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs index 719a46cc..ad263bff 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs @@ -10,7 +10,7 @@ use crate::utils::files::{open_read, open_sin}; const GENUS: u8 = 18; const RANKS: u8 = 27; const SPECIES: u8 = 22; -const NULL: &str = "\\N"; +const NULL_STRING: &str = "\\N"; const SEPARATOR: &str = "\t"; pub struct Taxonomy { @@ -28,17 +28,15 @@ impl Taxonomy { let line = line.with_context(|| { format!("Error reading line from input file {}", infile.display()) })?; - let elements: Vec = line.splitn(28, SEPARATOR).map(String::from).collect(); + let mut elements = line.splitn(28, SEPARATOR).map(parse_int); + let key = elements + .next() + .context("Unable to access key at first index of line")??; - let key = parse_int(&elements[0])?; // Note on the collect::<> here: "?" can't be used inside of map() as it is a closure // Collecting into a Result> will stop instantly when it receives one Error // https://doc.rust-lang.org/rust-by-example/error/iter_result.html#fail-the-entire-operation-with-collect - let lineage = elements - .iter() - .skip(1) - .map(parse_int) - .collect::>>()?; + let lineage = elements.collect::>>()?; taxonomy_map.insert(key, lineage); // Keep track of highest key @@ -97,8 +95,14 @@ impl Taxonomy { let lineages: Vec<&Vec> = taxa .iter() - .map(|x| &self.taxonomy[*x as usize]) - .filter(|x| !x.is_empty()) + .filter_map(|x| { + let result = &self.taxonomy[*x as usize]; + if !result.is_empty() { + Some(result) + } else { + None + } + }) .collect(); for rank in 0..RANKS { @@ -116,30 +120,20 @@ impl Taxonomy { } }); - let mut all_match = true; - + // Check if all elements in the iterator are the same // This was near-impossible to do with the iterators above, // so we're using a simplified loop here for item in iterator { if value == -1 { value = item; } else if item != value { - all_match = false; - break; + return lca; } } // If we found a new value that matched for all of them, use this as the new best - if value != -1 { - // If not everything matched, this is not a common ancestor anymore, - // so we can stop - if !all_match { - break; - } - - if value != 0 { - lca = value; - } + if value > 0 { + lca = value; } } @@ -151,8 +145,8 @@ impl Taxonomy { } } -fn parse_int(s: &String) -> Result { - if s == NULL { +fn parse_int(s: &str) -> Result { + if s == NULL_STRING { return Ok(0); } From 0675be3fb84cea019ac182ed72807b8e49771a3c Mon Sep 17 00:00:00 2001 From: stijndcl Date: Tue, 28 Nov 2023 12:16:44 +0100 Subject: [PATCH 15/18] Revert change --- .../unipept-database-rs/src/calculate_lcas/taxonomy.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs index ad263bff..6937b96c 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs @@ -95,14 +95,8 @@ impl Taxonomy { let lineages: Vec<&Vec> = taxa .iter() - .filter_map(|x| { - let result = &self.taxonomy[*x as usize]; - if !result.is_empty() { - Some(result) - } else { - None - } - }) + .map(|x| &self.taxonomy[*x as usize]) + .filter(|x| !x.is_empty()) .collect(); for rank in 0..RANKS { From 9ebb206b50a77f285c4cc1f3d18fec18338f4c05 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Tue, 5 Dec 2023 15:25:06 +0100 Subject: [PATCH 16/18] Bump db to 2023-12-05 --- workflows/static_database/version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/static_database/version.txt b/workflows/static_database/version.txt index a788ffb2..0b805422 100644 --- a/workflows/static_database/version.txt +++ b/workflows/static_database/version.txt @@ -1 +1 @@ -2023-11-01 +2023-12-05 From 57c256e2fed10fd134f5b7983fa38bc2c4d6ca9f Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Wed, 6 Dec 2023 09:16:42 +0100 Subject: [PATCH 17/18] Update workflow for static database creation --- .github/workflows/static_database.yml | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml index d1b22d1a..12aed147 100644 --- a/.github/workflows/static_database.yml +++ b/.github/workflows/static_database.yml @@ -44,18 +44,7 @@ jobs: # Compress the database before uploading it to a Github release zip output.zip output.db - - name: Update database versioning - shell: bash - run: | - rm workflows/static_database/version.txt - echo "${{ steps.date.outputs.date }}" > workflows/static_database/version.txt - - name: Update resources - uses: test-room-7/action-update-file@v1 - with: - file-path: workflows/static_database/version.txt - commit-msg: Bump db version to ${{ steps.date.outputs.date }} - github-token: ${{ secrets.GITHUB_TOKEN }} - - name: Get newly made commit sha + - name: Get latest made commit sha id: commit_sha shell: bash run: | From 44120bfa73ed85c4b738da8b9b324e163fe37de9 Mon Sep 17 00:00:00 2001 From: Pieter Verschaffelt Date: Wed, 6 Dec 2023 09:24:54 +0100 Subject: [PATCH 18/18] Switch tag creation action --- .github/workflows/static_database.yml | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml index 12aed147..84cde2f1 100644 --- a/.github/workflows/static_database.yml +++ b/.github/workflows/static_database.yml @@ -44,24 +44,12 @@ jobs: # Compress the database before uploading it to a Github release zip output.zip output.db - - name: Get latest made commit sha - id: commit_sha - shell: bash - run: | - echo "::set-output name=sha::$(git rev-parse HEAD)" - name: Create new tag - uses: octokit/request-action@v2.x - id: create_new_tag + uses: rickstaa/action-create-tag@v1 + id: "tag_create" with: - route: POST /repos/:owner/:repo/git/tags - owner: unipept - repo: make-database tag: database-${{ steps.date.outputs.date }} message: "Static information database built on ${{ steps.date.outputs.date }}" - object: ${{ steps.commit_sha.outputs.sha }} - type: commit - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Create Release id: create_release uses: actions/create-release@v1