diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml index d1b22d1a..84cde2f1 100644 --- a/.github/workflows/static_database.yml +++ b/.github/workflows/static_database.yml @@ -44,35 +44,12 @@ jobs: # Compress the database before uploading it to a Github release zip output.zip output.db - - name: Update database versioning - shell: bash - run: | - rm workflows/static_database/version.txt - echo "${{ steps.date.outputs.date }}" > workflows/static_database/version.txt - - name: Update resources - uses: test-room-7/action-update-file@v1 - with: - file-path: workflows/static_database/version.txt - commit-msg: Bump db version to ${{ steps.date.outputs.date }} - github-token: ${{ secrets.GITHUB_TOKEN }} - - name: Get newly made commit sha - id: commit_sha - shell: bash - run: | - echo "::set-output name=sha::$(git rev-parse HEAD)" - name: Create new tag - uses: octokit/request-action@v2.x - id: create_new_tag + uses: rickstaa/action-create-tag@v1 + id: "tag_create" with: - route: POST /repos/:owner/:repo/git/tags - owner: unipept - repo: make-database tag: database-${{ steps.date.outputs.date }} message: "Static information database built on ${{ steps.date.outputs.date }}" - object: ${{ steps.commit_sha.outputs.sha }} - type: commit - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Create Release id: create_release uses: actions/create-release@v1 diff --git a/scripts/helper_scripts/unipept-database-rs/Cargo.lock b/scripts/helper_scripts/unipept-database-rs/Cargo.lock index af2c0268..2a5816d6 100644 --- a/scripts/helper_scripts/unipept-database-rs/Cargo.lock +++ b/scripts/helper_scripts/unipept-database-rs/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + [[package]] name = "android-tzdata" version = "0.1.1" @@ -309,6 +318,41 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "regex" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" + [[package]] name = "smartstring" version = "1.0.1" @@ -332,6 +376,25 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strum" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" + +[[package]] +name = "strum_macros" +version = "0.25.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" +dependencies = [ + "heck", + "proc-macro2", + 
"quote", + "rustversion", + "syn", +] + [[package]] name = "syn" version = "2.0.37" @@ -357,7 +420,10 @@ dependencies = [ "bit-vec", "chrono", "clap", + "regex", "smartstring", + "strum", + "strum_macros", "uniprot", ] diff --git a/scripts/helper_scripts/unipept-database-rs/Cargo.toml b/scripts/helper_scripts/unipept-database-rs/Cargo.toml index 07f6cfdc..e83af45c 100644 --- a/scripts/helper_scripts/unipept-database-rs/Cargo.toml +++ b/scripts/helper_scripts/unipept-database-rs/Cargo.toml @@ -15,5 +15,8 @@ anyhow = "1.0.75" bit-vec = "0.6.3" chrono = "0.4.31" clap = { version = "4.4.6", features = ["derive"] } +regex = "1.10.2" smartstring = { version = "1.0" } +strum = "0.25.0" +strum_macros = "0.25.3" uniprot = "0.7.0" diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs new file mode 100644 index 00000000..75b8aa55 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs @@ -0,0 +1,21 @@ +use anyhow::{Context, Result}; +use clap::Parser; +use std::path::PathBuf; +use unipept_database::calculate_lcas::taxonomy::Taxonomy; +use unipept_database::taxons_uniprots_tables::utils::now_str; + +#[derive(Parser)] +struct Cli { + #[clap(long)] + infile: PathBuf, +} + +fn main() -> Result<()> { + let args = Cli::parse(); + + eprintln!("{}: reading taxonomy", now_str()); + let tax = Taxonomy::build(&args.infile).context("Unable to build taxonomy")?; + + eprintln!("{}: reading sequences", now_str()); + tax.calculate_lcas() +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs new file mode 100644 index 00000000..8fca8d73 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs @@ -0,0 +1,30 @@ +use anyhow::{Context, Result}; +use clap::Parser; +use std::path::PathBuf; +use unipept_database::taxons_lineages::taxon_list::TaxonList; + +fn main() 
-> Result<()> { + let args = Cli::parse(); + + let mut tl = TaxonList::from_dumps(&args.names, &args.nodes) + .context("Failed to parse TaxonList from dumps")?; + tl.invalidate().context("Failed to validate TaxonList")?; + tl.write_taxons(&args.taxons) + .context("Failed to write TaxonList")?; + tl.write_lineages(&args.lineages) + .context("Failed to write lineages")?; + + Ok(()) +} + +#[derive(Parser, Debug)] +struct Cli { + #[clap(short, long)] + names: PathBuf, + #[clap(short, long)] + nodes: PathBuf, + #[clap(short, long)] + taxons: PathBuf, + #[clap(short, long)] + lineages: PathBuf, +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/mod.rs b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/mod.rs new file mode 100644 index 00000000..652b0277 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/mod.rs @@ -0,0 +1 @@ +pub mod taxonomy; diff --git a/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs new file mode 100644 index 00000000..6937b96c --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs @@ -0,0 +1,149 @@ +use std::collections::HashMap; +use std::io::BufRead; +use std::path::PathBuf; + +use anyhow::{Context, Result}; + +use crate::taxons_uniprots_tables::utils::now_str; +use crate::utils::files::{open_read, open_sin}; + +const GENUS: u8 = 18; +const RANKS: u8 = 27; +const SPECIES: u8 = 22; +const NULL_STRING: &str = "\\N"; +const SEPARATOR: &str = "\t"; + +pub struct Taxonomy { + taxonomy: Vec<Vec<i32>>, +} + +impl Taxonomy { + pub fn build(infile: &PathBuf) -> Result<Self> { + let mut taxonomy_map: HashMap<i32, Vec<i32>> = HashMap::new(); + let reader = open_read(infile).context("Unable to open input file")?; + + let mut max = i32::MIN; + + for line in reader.lines() { + let line = line.with_context(|| { + format!("Error reading line from input file {}", infile.display()) +
})?; + let mut elements = line.splitn(28, SEPARATOR).map(parse_int); + let key = elements + .next() + .context("Unable to access key at first index of line")??; + + // Note on the collect::<> here: "?" can't be used inside of map() as it is a closure + // Collecting into a Result<Vec<_>> will stop instantly when it receives one Error + // https://doc.rust-lang.org/rust-by-example/error/iter_result.html#fail-the-entire-operation-with-collect + let lineage = elements.collect::<Result<Vec<i32>>>()?; + taxonomy_map.insert(key, lineage); + + // Keep track of highest key + if key > max { + max = key; + } + } + + let mut taxonomy = vec![Vec::new(); (max + 1) as usize]; + + for (key, value) in taxonomy_map { + taxonomy[key as usize] = value; + } + + Ok(Taxonomy { taxonomy }) + } + + pub fn calculate_lcas(&self) -> Result<()> { + let reader = open_sin(); + + let mut current_sequence = String::new(); + let mut taxa: Vec<i32> = Vec::new(); + + for (i, line) in reader.lines().enumerate() { + if i % 10000000 == 0 && i != 0 { + eprintln!("{}: {}", now_str(), i); + } + + let line = line.context("error reading line from stdin")?; + + let (sequence, taxon_id) = + line.split_once(SEPARATOR).context("error splitting line")?; + let taxon_id: i32 = taxon_id + .trim_end() + .parse() + .context("error parsing taxon id to int")?; + + if current_sequence.is_empty() || current_sequence != sequence { + if !current_sequence.is_empty() { + self.handle_lca(&current_sequence, self.calculate_lca(&taxa)); + } + + current_sequence = sequence.to_string(); + taxa.clear(); + } + + taxa.push(taxon_id); + } + + self.handle_lca(&current_sequence, self.calculate_lca(&taxa)); + Ok(()) + } + + fn calculate_lca(&self, taxa: &[i32]) -> i32 { + let mut lca = 1; + + let lineages: Vec<&Vec<i32>> = taxa + .iter() + .map(|x| &self.taxonomy[*x as usize]) + .filter(|x| !x.is_empty()) + .collect(); + + for rank in 0..RANKS { + let final_rank = rank; + let mut value = -1; + + let iterator = lineages + .iter() + .map(|&x| x[final_rank as usize]) + .filter(|&x| {
+ if final_rank == GENUS || final_rank == SPECIES { + x > 0 + } else { + x >= 0 + } + }); + + // Check if all elements in the iterator are the same + // This was near-impossible to do with the iterators above, + // so we're using a simplified loop here + for item in iterator { + if value == -1 { + value = item; + } else if item != value { + return lca; + } + } + + // If we found a new value that matched for all of them, use this as the new best + if value > 0 { + lca = value; + } + } + + lca + } + + fn handle_lca(&self, sequence: &String, lca: i32) { + println!("{}\t{}", sequence, lca); + } +} + +fn parse_int(s: &str) -> Result<i32> { + if s == NULL_STRING { + return Ok(0); + } + + s.parse::<i32>() + .with_context(|| format!("Error parsing {} as an integer", s)) +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/lib.rs b/scripts/helper_scripts/unipept-database-rs/src/lib.rs index 262d6198..9200454d 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/lib.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/lib.rs @@ -1,2 +1,4 @@ +pub mod calculate_lcas; +pub mod taxons_lineages; pub mod taxons_uniprots_tables; pub mod utils; diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/mod.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/mod.rs new file mode 100644 index 00000000..552415d3 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/mod.rs @@ -0,0 +1 @@ +pub mod taxon_list; diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs new file mode 100644 index 00000000..e197f228 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/taxon_list.rs @@ -0,0 +1,253 @@ +use std::io::{BufRead, Read, Write}; +use std::path::PathBuf; +use std::str::FromStr; + +use anyhow::{Context, Error, Result}; +use regex::Regex; +use strum::IntoEnumIterator; + +use
crate::taxons_uniprots_tables::models::{Rank, Taxon}; +use crate::utils::files::{open_read, open_write}; + +pub struct TaxonList { + entries: Vec<Option<Taxon>>, + validation_regex: Regex, +} + +impl TaxonList { + /// Parse a list of Taxons from the names and nodes dumps + pub fn from_dumps(names_pb: &PathBuf, nodes_pb: &PathBuf) -> Result<Self> { + let scientific_name = "scientific name"; + let pattern = "|"; + + let mut entries = vec![]; + + let mut names = open_read(names_pb).context("Unable to open names dump file")?; + let nodes = open_read(nodes_pb).context("Unable to open nodes dump file")?; + + for node_line in nodes.lines() { + let node_line = node_line.context("Error reading line from nodes dump file")?; + let node_row: Vec<&str> = node_line.split(pattern).collect(); + + let taxon_id = parse_id(node_row[0])?; + let parent_id = parse_id(node_row[1])?; + + let rank = Rank::from_str(node_row[2].trim()).context("Unable to parse Taxon Rank")?; + + let mut name = String::new(); + let mut clas = String::new(); + let mut taxon_id2 = usize::MAX; + + for name_line in names.by_ref().lines() { + let name_line = name_line.context("Error reading line from names dump file")?; + let name_row: Vec<&str> = name_line.split(pattern).collect(); + taxon_id2 = parse_id(name_row[0])?; + name = name_row[1].trim().to_string(); + clas = name_row[3].trim().to_string(); + + if clas == scientific_name { + break; + } + } + + if clas == scientific_name && taxon_id == taxon_id2 { + while entries.len() <= taxon_id { + entries.push(None); + } + + entries[taxon_id] = Some(Taxon::new(name, rank, parent_id, true)); + } else { + return Err(Error::msg(format!( + "Taxon {} did not have a scientific name", + taxon_id + ))); + } + } + + Ok(TaxonList { + entries, + validation_regex: Regex::new(r".*\d.*").context("Failed to initialize regex")?, + }) + } + + pub fn invalidate(&mut self) -> Result<()> { + for i in 0..self.entries.len() { + self.validate(i)?; + } + + Ok(()) + } + + fn validate(&mut self, id: usize) -> 
Result<bool> { + let taxon = self + .entries + .get_mut(id) + .with_context(|| format!("Missing Taxon with id {}", id))?; + let taxon = match taxon { + Some(t) => t, + None => return Ok(false), + }; + + if !taxon.valid + || (taxon.rank == Rank::Species + && ((self.validation_regex.is_match(taxon.name.as_str()) + && !taxon.name.contains("virus")) + || taxon.name.ends_with(" sp.") + || taxon.name.ends_with(" genomosp.") + || taxon.name.contains(" bacterium"))) + || taxon.name.contains("enrichment culture") + || taxon.name.contains("mixed culture") + || taxon.name.contains("uncultured") + || taxon.name.contains("unidentified") + || taxon.name.contains("unspecified") + || taxon.name.contains("undetermined") + || taxon.name.contains("sample") + || taxon.name.ends_with("metagenome") + || taxon.name.ends_with("library") + || id == 28384 + || id == 48479 + || id == 1869227 + { + taxon.valid = false; + return Ok(false); + } + + if id == 1 { + return Ok(true); + } + + let parent = taxon.parent; + let parent_valid = self.validate(parent)?; + + // I don't like this duplication but we have to do it because of the borrow checker + // Otherwise, the recursive call above ^ will cause two mutable references at the same time + // And we need one to mark the taxon as invalid + let taxon = self + .entries + .get_mut(id) + .with_context(|| format!("Missing taxon with id {}", id))?; + let taxon = match taxon { + Some(t) => t, + None => return Ok(false), + }; + + if !parent_valid { + taxon.valid = false; + } + + Ok(taxon.valid) + } + + pub fn write_taxons(&self, pb: &PathBuf) -> Result<()> { + let mut writer = open_write(pb).context("Unable to open taxon output file")?; + + for (id, taxon) in self.entries.iter().enumerate() { + let taxon = if let Some(t) = taxon { + t + } else { + continue; + }; + + let valid = if taxon.valid { '\u{0001}' } else { '\u{0000}' }; + + writeln!( + &mut writer, + "{}\t{}\t{}\t{}\t{}", + id, taxon.name, taxon.rank, taxon.parent, valid + ) + .context("Error writing 
to taxon TSV file")?; + } + + Ok(()) + } + + pub fn write_lineages(&self, pb: &PathBuf) -> Result<()> { + let mut writer = open_write(pb).context("Unable to open lineage output file")?; + let n_ranks = Rank::iter().count(); + + for (i, taxon) in self.entries.iter().enumerate() { + if taxon.is_none() { + continue; + } + + let mut lineage: Vec<String> = vec![String::from("\\N"); n_ranks]; + lineage[0] = i.to_string(); + + let mut tid = self.ranked_ancestor(i)?; + let mut taxon = self.get_taxon_some(tid)?; + let mut valid = taxon.valid; + + for j in (1..=(n_ranks - 1)).rev() { + if j > taxon.rank.index() { + lineage[j] = if valid { + "\\N".to_string() + } else { + "-1".to_string() + }; + } else { + valid = taxon.valid; + lineage[j] = (if valid { 1 } else { -1 } * (tid as i32)).to_string(); + tid = self.ranked_ancestor(taxon.parent)?; + taxon = self.get_taxon_some(tid)?; + } + } + + writeln!(&mut writer, "{}", lineage.join("\t")) + .context("Error writing to lineage TSV file")?; + } + + Ok(()) + } + + fn ranked_ancestor(&self, mut tid: usize) -> Result<usize> { + let mut taxon = self.get_taxon(tid)?; + let mut pid = usize::MAX; + + // Note: this unwrap() call is safe because of the is_some() beforehand + while taxon.is_some() && tid != pid && taxon.as_ref().unwrap().rank == Rank::NoRank { + pid = tid; + tid = taxon.as_ref().unwrap().parent; + taxon = self.get_taxon(tid)?; + } + + if taxon.is_some() { + return Ok(tid); + } + + Ok(1) // Used in case a taxon is no descendant of the root + } + + fn get_taxon(&self, id: usize) -> Result<&Option<Taxon>> { + self.entries + .get(id) + .with_context(|| format!("Invalid taxon id {}", id)) + } + + /// Similar to get_taxon, but unwraps the Option and gives a reference to the Taxon inside of it + /// This will throw an error if the Taxon is None + fn get_taxon_some(&self, id: usize) -> Result<&Taxon> { + if let Some(t) = self.get_taxon(id)? 
{ + Ok(t) + } else { + Err(Error::msg(format!("Missing taxon with id {}", id))) + } + } + + pub fn get(&self, i: usize) -> &Option<Taxon> { + &self.entries[i] + } + + pub fn is_empty(&self) -> bool { + self.entries.is_empty() + } + + pub fn len(&self) -> usize { + self.entries.len() + } +} + +fn parse_id(v: &str) -> Result<usize> { + v.trim() + .parse::<usize>() + .with_context(|| format!("Unable to parse {} as usize", v)) +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs index a0c70214..6f3298d6 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs @@ -1,6 +1,5 @@ -use std::str::FromStr; - -use anyhow::{Context, Error, Result}; +use anyhow::{Context, Result}; +use strum_macros::{Display, EnumCount, EnumIter, EnumString}; #[derive(Debug)] pub struct Entry { @@ -87,86 +86,54 @@ pub fn calculate_entry_digest( result } -#[derive(Debug)] +// This is taken directly from UMGAP, with Infraclass and Parvorder removed +// Once these changes are merged in UMGAP, this can be replaced with a dependency +// TODO +#[rustfmt::skip] +#[derive(PartialEq, Eq, Debug, Clone, Copy, Display, EnumString, EnumCount, EnumIter)] pub enum Rank { - NoRank, - SuperKingdom, - Kingdom, - SubKingdom, - SuperPhylum, - Phylum, - SubPhylum, - SuperClass, - Class, - SubClass, - SuperOrder, - Order, - SubOrder, - InfraOrder, - SuperFamily, - Family, - SubFamily, - Tribe, - SubTribe, - Genus, - SubGenus, - SpeciesGroup, - SpeciesSubgroup, - Species, - SubSpecies, - Strain, - Varietas, - Forma, + #[strum(serialize="no rank")] NoRank, + #[strum(serialize="superkingdom")] Superkingdom, + #[strum(serialize="kingdom")] Kingdom, + #[strum(serialize="subkingdom")] Subkingdom, + #[strum(serialize="superphylum")] Superphylum, + #[strum(serialize="phylum")] Phylum, + 
#[strum(serialize="subphylum")] Subphylum, + #[strum(serialize="superclass")] Superclass, + #[strum(serialize="class")] Class, + #[strum(serialize="subclass")] Subclass, + #[strum(serialize="superorder")] Superorder, + #[strum(serialize="order")] Order, + #[strum(serialize="suborder")] Suborder, + #[strum(serialize="infraorder")] Infraorder, + #[strum(serialize="superfamily")] Superfamily, + #[strum(serialize="family")] Family, + #[strum(serialize="subfamily")] Subfamily, + #[strum(serialize="tribe")] Tribe, + #[strum(serialize="subtribe")] Subtribe, + #[strum(serialize="genus")] Genus, + #[strum(serialize="subgenus")] Subgenus, + #[strum(serialize="species group")] SpeciesGroup, + #[strum(serialize="species subgroup")] SpeciesSubgroup, + #[strum(serialize="species")] Species, + #[strum(serialize="subspecies")] Subspecies, + #[strum(serialize="strain")] Strain, + #[strum(serialize="varietas")] Varietas, + #[strum(serialize="forma")] Forma, } -impl FromStr for Rank { - type Err = Error; - - fn from_str(s: &str) -> Result { - match s.to_uppercase().replace(' ', "_").as_str() { - "CLASS" => Ok(Self::Class), - "FAMILY" => Ok(Self::Family), - "FORMA" => Ok(Self::Forma), - "GENUS" => Ok(Self::Genus), - "INFRAORDER" => Ok(Self::InfraOrder), - "KINGDOM" => Ok(Self::Kingdom), - "NO_RANK" => Ok(Self::NoRank), - "ORDER" => Ok(Self::Order), - "PHYLUM" => Ok(Self::Phylum), - "SPECIES" => Ok(Self::Species), - "SPECIES_GROUP" => Ok(Self::SpeciesGroup), - "SPECIES_SUBGROUP" => Ok(Self::SpeciesSubgroup), - "STRAIN" => Ok(Self::Strain), - "SUBCLASS" => Ok(Self::SubClass), - "SUBFAMILY" => Ok(Self::SubFamily), - "SUBGENUS" => Ok(Self::SubGenus), - "SUBKINGDOM" => Ok(Self::SubKingdom), - "SUBORDER" => Ok(Self::SubOrder), - "SUBPHYLUM" => Ok(Self::SubPhylum), - "SUBSPECIES" => Ok(Self::SubSpecies), - "SUBTRIBE" => Ok(Self::SubTribe), - "SUPERCLASS" => Ok(Self::SuperClass), - "SUPERFAMILY" => Ok(Self::SuperFamily), - "SUPERKINGDOM" => Ok(Self::SuperKingdom), - "SUPERORDER" => 
Ok(Self::SuperOrder), - "SUPERPHYLUM" => Ok(Self::SuperPhylum), - "TRIBE" => Ok(Self::Tribe), - "VARIETAS" => Ok(Self::Varietas), - _ => Err(Error::msg(format!( - "Value {} does not match any known ranks", - s - ))), - } +impl Rank { + pub fn index(&self) -> usize { + *self as usize } } -#[allow(dead_code)] // The fields in this struct aren't used YET, but will be later on #[derive(Debug)] pub struct Taxon { - name: String, - rank: Rank, - parent: usize, - valid: bool, + pub name: String, + pub rank: Rank, + pub parent: usize, + pub valid: bool, } impl Taxon { diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/taxon_list.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/taxon_list.rs index fb876817..2c1b53a2 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/taxon_list.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/taxon_list.rs @@ -1,63 +1,10 @@ -use crate::taxons_uniprots_tables::models::{Rank, Taxon}; -use anyhow::{Context, Result}; -use bit_vec::BitVec; use std::io::BufRead; use std::path::PathBuf; -use std::str::FromStr; - -use crate::utils::files::open_read; - -pub struct TaxonList { - entries: Vec<Option<Taxon>>, -} - -impl TaxonList { - pub fn from_file(pb: &PathBuf) -> Result<Self> { - let mut entries = Vec::new(); - let reader = open_read(pb).context("Unable to open input file")?; - - for line in reader.lines() { - let line = line - .with_context(|| format!("Error reading line from input file {}", pb.display()))?; - let spl: Vec<&str> = line.split('\t').collect(); - let id: usize = spl[0] - .parse() - .with_context(|| format!("Unable to parse {} as usize", spl[0]))?; - let parent: usize = spl[3] - .parse() - .with_context(|| format!("Unable to parse {} as usize", spl[3]))?; - let valid = spl[4].trim() == "true"; - - let taxon = Taxon::new( - spl[1].to_string(), - Rank::from_str(spl[2]) - .with_context(|| format!("Unable to parse {} into Rank", spl[2]))?, - 
parent, - valid, - ); - - while entries.len() <= id { - entries.push(None); - } - - entries[id] = Some(taxon); - } - - Ok(TaxonList { entries }) - } - pub fn get(&self, i: usize) -> &Option<Taxon> { - &self.entries[i] - } - - pub fn is_empty(&self) -> bool { - self.entries.is_empty() - } +use anyhow::{Context, Result}; +use bit_vec::BitVec; - pub fn len(&self) -> usize { - self.entries.len() - } -} +use crate::utils::files::open_read; /// Parse a taxons TSV-file into a vector that can be accessed by id /// The actual content of these Taxons is never used, so we don't try to parse a struct diff --git a/workflows/static_database/version.txt b/workflows/static_database/version.txt index a788ffb2..0b805422 100644 --- a/workflows/static_database/version.txt +++ b/workflows/static_database/version.txt @@ -1 +1 @@ -2023-11-01 +2023-12-05