From 2bc265a41e502c06e91ad9b77e59e4ce5340da57 Mon Sep 17 00:00:00 2001 From: Felix Van der Jeugt Date: Tue, 7 Jan 2020 16:50:16 +0100 Subject: [PATCH] Add command to join kmers as replacement for LineagePeptideTaxon2LCAs --- Cargo.toml | 2 +- src/args.rs | 12 ++++++++++++ src/main.rs | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 365a81d..28b5807 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "umgap" -version = "0.3.4" +version = "0.3.5" authors = ["Felix Van der Jeugt ", "Stijn Seghers ", "Niels De Graef ", diff --git a/src/args.rs b/src/args.rs index 93e3b0c..413b27a 100644 --- a/src/args.rs +++ b/src/args.rs @@ -191,6 +191,10 @@ pub enum Opt { #[structopt(name = "splitkmers")] SplitKmers(SplitKmers), + /// Groups a CSV by equal first fields (Kmers) and aggregates the second fields (taxon ids). + #[structopt(name = "joinkmers")] + JoinKmers(JoinKmers), + /// Write an FST index of stdin on stdout. #[structopt(name = "buildindex")] BuildIndex, @@ -565,6 +569,14 @@ pub struct SplitKmers { pub prefix: String, } +/// Groups a CSV by equal first fields (Kmers) and aggregates the second fields (taxon ids). +#[derive(Debug, StructOpt)] +pub struct JoinKmers { + /// The NCBI taxonomy tsv-file + #[structopt(parse(from_os_str))] + pub taxon_file: PathBuf, +} + error_chain! { errors { /// Unparseable Frame diff --git a/src/main.rs b/src/main.rs index c62ec47..20954ab 100644 --- a/src/main.rs +++ b/src/main.rs @@ -45,6 +45,7 @@ use umgap::taxon; use umgap::errors::{Result, ErrorKind}; use umgap::taxon::TaxonId; use umgap::agg; +use umgap::agg::Aggregator; use umgap::rmq; use umgap::tree; use umgap::utils; @@ -72,6 +73,7 @@ quick_main!(|| -> Result<()> { args::Opt::BestOf(args) => bestof(args), args::Opt::PrintIndex(args) => printindex(args), args::Opt::SplitKmers(args) => splitkmers(args), + args::Opt::JoinKmers(args) => joinkmers(args), args::Opt::BuildIndex => buildindex(), args::Opt::CountRecords => countrecords(), } @@ -761,6 +763,57 @@ fn buildindex() -> Result<()> { Ok(()) } +fn joinkmers(args: args::JoinKmers) -> Result<()> { + let mut reader = csv::ReaderBuilder::new() + .has_headers(false) + .delimiter(b'\t') + .from_reader(io::stdin()); + + let mut writer = csv::WriterBuilder::new() + .delimiter(b'\t') + .from_writer(io::stdout()); + + // Parsing the Taxa file + let taxons = taxon::read_taxa_file(args.taxon_file)?; + + // Parsing the taxons + let tree = taxon::TaxonTree::new(&taxons); + let by_id = taxon::TaxonList::new(taxons); + let snapping = tree.snapping(&by_id, true); + let aggregator = tree::lca::LCACalculator::new(tree.root, &by_id); + + let mut emit = |kmer: &str, tids: Vec<(TaxonId, f32)>| { + let counts = agg::count(tids.into_iter()); + if let Ok(aggregate) = aggregator.aggregate(&counts) { + writer.serialize((kmer, snapping[aggregate].unwrap())) + } else { + Ok(()) + } + }; + + // Iterate over records and emit groups + let mut current_kmer: Option = Option::None; + let mut current_tids = vec!(); + for record in reader.deserialize() { + let (kmer, tid): (String, TaxonId) = record?; + if let Some(c) = current_kmer { + if c != kmer { + emit(&c, current_tids)?; + current_tids = vec!(); + } + } else { + current_tids = vec!(); + } + current_kmer = Some(kmer); + current_tids.push((tid, 1.0)); + } + if let Some(c) = current_kmer { + emit(&c, current_tids)?; + } + + Ok(()) +} + fn countrecords() -> Result<()> { let mut records = 0; let mut sequences = 0;