-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' of https://github.com/unipept/make-database
- Loading branch information
Showing
13 changed files
with
574 additions
and
157 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
21 changes: 21 additions & 0 deletions
21
scripts/helper_scripts/unipept-database-rs/src/bin/lcas.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
use anyhow::{Context, Result}; | ||
use clap::Parser; | ||
use std::path::PathBuf; | ||
use unipept_database::calculate_lcas::taxonomy::Taxonomy; | ||
use unipept_database::taxons_uniprots_tables::utils::now_str; | ||
|
||
#[derive(Parser)] | ||
struct Cli { | ||
#[clap(long)] | ||
infile: PathBuf, | ||
} | ||
|
||
fn main() -> Result<()> { | ||
let args = Cli::parse(); | ||
|
||
eprintln!("{}: reading taxonomy", now_str()); | ||
let tax = Taxonomy::build(&args.infile).context("Unable to build taxonomy")?; | ||
|
||
eprintln!("{}: reading sequences", now_str()); | ||
tax.calculate_lcas() | ||
} |
30 changes: 30 additions & 0 deletions
30
scripts/helper_scripts/unipept-database-rs/src/bin/taxons-lineages.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
use anyhow::{Context, Result}; | ||
use clap::Parser; | ||
use std::path::PathBuf; | ||
use unipept_database::taxons_lineages::taxon_list::TaxonList; | ||
|
||
fn main() -> Result<()> { | ||
let args = Cli::parse(); | ||
|
||
let mut tl = TaxonList::from_dumps(&args.names, &args.nodes) | ||
.context("Failed to parse TaxonList from dumps")?; | ||
tl.invalidate().context("Failed to validate TaxonList")?; | ||
tl.write_taxons(&args.taxons) | ||
.context("Failed to write TaxonList")?; | ||
tl.write_lineages(&args.lineages) | ||
.context("Failed to write lineages")?; | ||
|
||
Ok(()) | ||
} | ||
|
||
#[derive(Parser, Debug)] | ||
struct Cli { | ||
#[clap(short, long)] | ||
names: PathBuf, | ||
#[clap(short, long)] | ||
nodes: PathBuf, | ||
#[clap(short, long)] | ||
taxons: PathBuf, | ||
#[clap(short, long)] | ||
lineages: PathBuf, | ||
} |
1 change: 1 addition & 0 deletions
1
scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/mod.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
pub mod taxonomy; |
149 changes: 149 additions & 0 deletions
149
scripts/helper_scripts/unipept-database-rs/src/calculate_lcas/taxonomy.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
use std::collections::HashMap; | ||
use std::io::BufRead; | ||
use std::path::PathBuf; | ||
|
||
use anyhow::{Context, Result}; | ||
|
||
use crate::taxons_uniprots_tables::utils::now_str; | ||
use crate::utils::files::{open_read, open_sin}; | ||
|
||
const GENUS: u8 = 18; | ||
const RANKS: u8 = 27; | ||
const SPECIES: u8 = 22; | ||
const NULL_STRING: &str = "\\N"; | ||
const SEPARATOR: &str = "\t"; | ||
|
||
pub struct Taxonomy { | ||
taxonomy: Vec<Vec<i32>>, | ||
} | ||
|
||
impl Taxonomy { | ||
pub fn build(infile: &PathBuf) -> Result<Self> { | ||
let mut taxonomy_map: HashMap<i32, Vec<i32>> = HashMap::new(); | ||
let reader = open_read(infile).context("Unable to open input file")?; | ||
|
||
let mut max = i32::MIN; | ||
|
||
for line in reader.lines() { | ||
let line = line.with_context(|| { | ||
format!("Error reading line from input file {}", infile.display()) | ||
})?; | ||
let mut elements = line.splitn(28, SEPARATOR).map(parse_int); | ||
let key = elements | ||
.next() | ||
.context("Unable to access key at first index of line")??; | ||
|
||
// Note on the collect::<> here: "?" can't be used inside of map() as it is a closure | ||
// Collecting into a Result<Vec<_>> will stop instantly when it receives one Error | ||
// https://doc.rust-lang.org/rust-by-example/error/iter_result.html#fail-the-entire-operation-with-collect | ||
let lineage = elements.collect::<Result<Vec<i32>>>()?; | ||
taxonomy_map.insert(key, lineage); | ||
|
||
// Keep track of highest key | ||
if key > max { | ||
max = key; | ||
} | ||
} | ||
|
||
let mut taxonomy = vec![Vec::new(); (max + 1) as usize]; | ||
|
||
for (key, value) in taxonomy_map { | ||
taxonomy[key as usize] = value; | ||
} | ||
|
||
Ok(Taxonomy { taxonomy }) | ||
} | ||
|
||
pub fn calculate_lcas(&self) -> Result<()> { | ||
let reader = open_sin(); | ||
|
||
let mut current_sequence = String::new(); | ||
let mut taxa: Vec<i32> = Vec::new(); | ||
|
||
for (i, line) in reader.lines().enumerate() { | ||
if i % 10000000 == 0 && i != 0 { | ||
eprintln!("{}: {}", now_str(), i); | ||
} | ||
|
||
let line = line.context("error reading line from stdin")?; | ||
|
||
let (sequence, taxon_id) = | ||
line.split_once(SEPARATOR).context("error splitting line")?; | ||
let taxon_id: i32 = taxon_id | ||
.trim_end() | ||
.parse() | ||
.context("error parsing taxon id to int")?; | ||
|
||
if current_sequence.is_empty() || current_sequence != sequence { | ||
if !current_sequence.is_empty() { | ||
self.handle_lca(¤t_sequence, self.calculate_lca(&taxa)); | ||
} | ||
|
||
current_sequence = sequence.to_string(); | ||
taxa.clear(); | ||
} | ||
|
||
taxa.push(taxon_id); | ||
} | ||
|
||
self.handle_lca(¤t_sequence, self.calculate_lca(&taxa)); | ||
Ok(()) | ||
} | ||
|
||
fn calculate_lca(&self, taxa: &[i32]) -> i32 { | ||
let mut lca = 1; | ||
|
||
let lineages: Vec<&Vec<i32>> = taxa | ||
.iter() | ||
.map(|x| &self.taxonomy[*x as usize]) | ||
.filter(|x| !x.is_empty()) | ||
.collect(); | ||
|
||
for rank in 0..RANKS { | ||
let final_rank = rank; | ||
let mut value = -1; | ||
|
||
let iterator = lineages | ||
.iter() | ||
.map(|&x| x[final_rank as usize]) | ||
.filter(|&x| { | ||
if final_rank == GENUS || final_rank == SPECIES { | ||
x > 0 | ||
} else { | ||
x >= 0 | ||
} | ||
}); | ||
|
||
// Check if all elements in the iterator are the same | ||
// This was near-impossible to do with the iterators above, | ||
// so we're using a simplified loop here | ||
for item in iterator { | ||
if value == -1 { | ||
value = item; | ||
} else if item != value { | ||
return lca; | ||
} | ||
} | ||
|
||
// If we found a new value that matched for all of them, use this as the new best | ||
if value > 0 { | ||
lca = value; | ||
} | ||
} | ||
|
||
lca | ||
} | ||
|
||
fn handle_lca(&self, sequence: &String, lca: i32) { | ||
println!("{}\t{}", sequence, lca); | ||
} | ||
} | ||
|
||
fn parse_int(s: &str) -> Result<i32> { | ||
if s == NULL_STRING { | ||
return Ok(0); | ||
} | ||
|
||
s.parse::<i32>() | ||
.with_context(|| format!("Error parsing {} as an integer", s)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,4 @@ | ||
pub mod calculate_lcas; | ||
pub mod taxons_lineages; | ||
pub mod taxons_uniprots_tables; | ||
pub mod utils; |
1 change: 1 addition & 0 deletions
1
scripts/helper_scripts/unipept-database-rs/src/taxons_lineages/mod.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
pub mod taxon_list; |
Oops, something went wrong.