From 2bc265a41e502c06e91ad9b77e59e4ce5340da57 Mon Sep 17 00:00:00 2001
From: Felix Van der Jeugt <felix.vanderjeugt@posteo.net>
Date: Tue, 7 Jan 2020 16:50:16 +0100
Subject: [PATCH] Add command to join kmers as replacement for
 LineagePeptideTaxon2LCAs

---
 Cargo.toml  |  2 +-
 src/args.rs | 12 ++++++++++++
 src/main.rs | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 1 deletion(-)
diff --git a/Cargo.toml b/Cargo.toml
index 365a81d..28b5807 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "umgap"
-version = "0.3.4"
+version = "0.3.5"
 authors = ["Felix Van der Jeugt <felix.vanderjeugt@gmail.com>",
            "Stijn Seghers <stijn.seghers@gmail.com>",
            "Niels De Graef <Niels.DeGraef@UGent.be>",
diff --git a/src/args.rs b/src/args.rs
index 93e3b0c..413b27a 100644
--- a/src/args.rs
+++ b/src/args.rs
@@ -191,6 +191,10 @@ pub enum Opt {
 	#[structopt(name = "splitkmers")]
 	SplitKmers(SplitKmers),
 
+	/// Groups a CSV by equal first fields (Kmers) and aggregates the second fields (taxon ids).
+	#[structopt(name = "joinkmers")]
+	JoinKmers(JoinKmers),
+
 	/// Write an FST index of stdin on stdout.
 	#[structopt(name = "buildindex")]
 	BuildIndex,
@@ -565,6 +569,14 @@ pub struct SplitKmers {
 	pub prefix: String,
 }
 
+/// Groups a CSV by equal first fields (Kmers) and aggregates the second fields (taxon ids).
+#[derive(Debug, StructOpt)]
+pub struct JoinKmers {
+	/// The NCBI taxonomy tsv-file
+	#[structopt(parse(from_os_str))]
+	pub taxon_file: PathBuf,
+}
+
 error_chain! {
 	errors {
 		/// Unparseable Frame
diff --git a/src/main.rs b/src/main.rs
index c62ec47..20954ab 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -45,6 +45,7 @@ use umgap::taxon;
 use umgap::errors::{Result, ErrorKind};
 use umgap::taxon::TaxonId;
 use umgap::agg;
+use umgap::agg::Aggregator;
 use umgap::rmq;
 use umgap::tree;
 use umgap::utils;
@@ -72,6 +73,7 @@ quick_main!(|| -> Result<()> {
 		args::Opt::BestOf(args)          => bestof(args),
 		args::Opt::PrintIndex(args)      => printindex(args),
 		args::Opt::SplitKmers(args)      => splitkmers(args),
+		args::Opt::JoinKmers(args)       => joinkmers(args),
 		args::Opt::BuildIndex            => buildindex(),
 		args::Opt::CountRecords          => countrecords(),
 	}
@@ -761,6 +763,57 @@ fn buildindex() -> Result<()> {
 	Ok(())
 }
 
+fn joinkmers(args: args::JoinKmers) -> Result<()> {
+	let mut reader = csv::ReaderBuilder::new()
+	                                    .has_headers(false)
+	                                    .delimiter(b'\t')
+	                                    .from_reader(io::stdin());
+
+	let mut writer = csv::WriterBuilder::new()
+	                                    .delimiter(b'\t')
+	                                    .from_writer(io::stdout());
+
+	// Parsing the Taxa file
+	let taxons = taxon::read_taxa_file(args.taxon_file)?;
+
+	// Parsing the taxons
+	let tree = taxon::TaxonTree::new(&taxons);
+	let by_id = taxon::TaxonList::new(taxons);
+	let snapping = tree.snapping(&by_id, true);
+	let aggregator = tree::lca::LCACalculator::new(tree.root, &by_id);
+
+	let mut emit = |kmer: &str, tids: Vec<(TaxonId, f32)>| {
+		let counts = agg::count(tids.into_iter());
+		if let Ok(aggregate) = aggregator.aggregate(&counts) {
+			writer.serialize((kmer, snapping[aggregate].unwrap()))
+		} else {
+			Ok(())
+		}
+	};
+
+	// Iterate over records and emit groups
+	let mut current_kmer: Option<String> = Option::None;
+	let mut current_tids = vec!();
+	for record in reader.deserialize() {
+		let (kmer, tid): (String, TaxonId) = record?;
+		if let Some(c) = current_kmer {
+			if c != kmer {
+				emit(&c, current_tids)?;
+				current_tids = vec!();
+			}
+		} else {
+			current_tids = vec!();
+		}
+		current_kmer = Some(kmer);
+		current_tids.push((tid, 1.0));
+	}
+	if let Some(c) = current_kmer {
+		emit(&c, current_tids)?;
+	}
+
+	Ok(())
+}
+
 fn countrecords() -> Result<()> {
 	let mut records = 0;
 	let mut sequences = 0;