Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rust LCA #32

Merged
merged 17 commits into from
Jan 10, 2024
66 changes: 66 additions & 0 deletions scripts/helper_scripts/unipept-database-rs/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions scripts/helper_scripts/unipept-database-rs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,8 @@ anyhow = "1.0.75"
bit-vec = "0.6.3"
chrono = "0.4.31"
clap = { version = "4.4.6", features = ["derive"] }
regex = "1.10.2"
smartstring = { version = "1.0" }
strum = "0.25.0"
strum_macros = "0.25.3"
uniprot = "0.7.0"
21 changes: 21 additions & 0 deletions scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
use anyhow::{Context, Result};
use clap::Parser;
use std::path::PathBuf;
use unipept_database::calculate_lcas::taxonomy::Taxonomy;
use unipept_database::taxons_uniprots_tables::utils::now_str;

#[derive(Parser)]
struct Cli {
#[clap(long)]
infile: PathBuf,
}

fn main() -> Result<()> {
let args = Cli::parse();

eprintln!("{}: reading taxonomy", now_str());
let tax = Taxonomy::build(&args.infile).context("Unable to build taxonomy")?;

eprintln!("{}: reading sequences", now_str());
tax.calculate_lcas()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
use anyhow::{Context, Result};
use clap::Parser;
use std::path::PathBuf;
use unipept_database::taxons_lineages::taxon_list::TaxonList;

fn main() -> Result<()> {
let args = Cli::parse();

let mut tl = TaxonList::from_dumps(&args.names, &args.nodes)
.context("Failed to parse TaxonList from dumps")?;
tl.invalidate().context("Failed to validate TaxonList")?;
tl.write_taxons(&args.taxons)
.context("Failed to write TaxonList")?;
tl.write_lineages(&args.lineages)
.context("Failed to write lineages")?;

Ok(())
}

#[derive(Parser, Debug)]
struct Cli {
#[clap(short, long)]
names: PathBuf,
#[clap(short, long)]
nodes: PathBuf,
#[clap(short, long)]
taxons: PathBuf,
#[clap(short, long)]
lineages: PathBuf,
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod taxonomy;
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
use std::collections::HashMap;
use std::io::BufRead;
use std::path::PathBuf;

use anyhow::{Context, Result};

use crate::taxons_uniprots_tables::utils::now_str;
use crate::utils::files::{open_read, open_sin};

const GENUS: u8 = 18;
const RANKS: u8 = 27;
const SPECIES: u8 = 22;
const NULL_STRING: &str = "\\N";
const SEPARATOR: &str = "\t";

pub struct Taxonomy {
taxonomy: Vec<Vec<i32>>,
}

impl Taxonomy {
pub fn build(infile: &PathBuf) -> Result<Self> {
let mut taxonomy_map: HashMap<i32, Vec<i32>> = HashMap::new();
let reader = open_read(infile).context("Unable to open input file")?;

let mut max = i32::MIN;

for line in reader.lines() {
let line = line.with_context(|| {
format!("Error reading line from input file {}", infile.display())
})?;
let mut elements = line.splitn(28, SEPARATOR).map(parse_int);
let key = elements
.next()
.context("Unable to access key at first index of line")??;

// Note on the collect::<> here: "?" can't be used inside of map() as it is a closure
// Collecting into a Result<Vec<_>> will stop instantly when it receives one Error
// https://doc.rust-lang.org/rust-by-example/error/iter_result.html#fail-the-entire-operation-with-collect
let lineage = elements.collect::<Result<Vec<i32>>>()?;
taxonomy_map.insert(key, lineage);

// Keep track of highest key
if key > max {
max = key;
}
}

let mut taxonomy = vec![Vec::new(); (max + 1) as usize];

for (key, value) in taxonomy_map {
taxonomy[key as usize] = value;
}

Ok(Taxonomy { taxonomy })
}

pub fn calculate_lcas(&self) -> Result<()> {
let reader = open_sin();

let mut current_sequence = String::new();
let mut taxa: Vec<i32> = Vec::new();

for (i, line) in reader.lines().enumerate() {
if i % 10000000 == 0 && i != 0 {
eprintln!("{}: {}", now_str(), i);
}

let line = line.context("error reading line from stdin")?;

let (sequence, taxon_id) =
line.split_once(SEPARATOR).context("error splitting line")?;
let taxon_id: i32 = taxon_id
.trim_end()
.parse()
.context("error parsing taxon id to int")?;

if current_sequence.is_empty() || current_sequence != sequence {
if !current_sequence.is_empty() {
self.handle_lca(&current_sequence, self.calculate_lca(&taxa));
}

current_sequence = sequence.to_string();
taxa.clear();
}

taxa.push(taxon_id);
}

self.handle_lca(&current_sequence, self.calculate_lca(&taxa));
Ok(())
}

fn calculate_lca(&self, taxa: &[i32]) -> i32 {
let mut lca = 1;

let lineages: Vec<&Vec<i32>> = taxa
.iter()
.map(|x| &self.taxonomy[*x as usize])
.filter(|x| !x.is_empty())
stijndcl marked this conversation as resolved.
Show resolved Hide resolved
.collect();

for rank in 0..RANKS {
let final_rank = rank;
let mut value = -1;

let iterator = lineages
.iter()
.map(|&x| x[final_rank as usize])
.filter(|&x| {
if final_rank == GENUS || final_rank == SPECIES {
x > 0
} else {
x >= 0
}
});

// Check if all elements in the iterator are the same
// This was near-impossible to do with the iterators above,
// so we're using a simplified loop here
stijndcl marked this conversation as resolved.
Show resolved Hide resolved
for item in iterator {
if value == -1 {
value = item;
} else if item != value {
return lca;
}
}

// If we found a new value that matched for all of them, use this as the new best
if value > 0 {
lca = value;
}
}

lca
}

fn handle_lca(&self, sequence: &String, lca: i32) {
println!("{}\t{}", sequence, lca);
}
}

fn parse_int(s: &str) -> Result<i32> {
if s == NULL_STRING {
return Ok(0);
}

s.parse::<i32>()
.with_context(|| format!("Error parsing {} as an integer", s))
}
2 changes: 2 additions & 0 deletions scripts/helper_scripts/unipept-database-rs/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
pub mod calculate_lcas;
pub mod taxons_lineages;
pub mod taxons_uniprots_tables;
pub mod utils;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod taxon_list;
Loading