Skip to content

Commit

Permalink
Add command to join kmers as replacement for LineagePeptideTaxon2LCAs
Browse files Browse the repository at this point in the history
  • Loading branch information
Felix Van der Jeugt committed Jan 7, 2020
1 parent 64b23b1 commit 2bc265a
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 1 deletion.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "umgap"
version = "0.3.4"
version = "0.3.5"
authors = ["Felix Van der Jeugt <[email protected]>",
"Stijn Seghers <[email protected]>",
"Niels De Graef <[email protected]>",
Expand Down
12 changes: 12 additions & 0 deletions src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,10 @@ pub enum Opt {
#[structopt(name = "splitkmers")]
SplitKmers(SplitKmers),

/// Groups a CSV by equal first fields (Kmers) and aggregates the second fields (taxon ids).
#[structopt(name = "joinkmers")]
JoinKmers(JoinKmers),

/// Write an FST index of stdin on stdout.
#[structopt(name = "buildindex")]
BuildIndex,
Expand Down Expand Up @@ -565,6 +569,14 @@ pub struct SplitKmers {
pub prefix: String,
}

/// Groups a CSV by equal first fields (Kmers) and aggregates the second fields (taxon ids).
#[derive(Debug, StructOpt)]
pub struct JoinKmers {
/// The NCBI taxonomy tsv-file
#[structopt(parse(from_os_str))]
pub taxon_file: PathBuf,
}

error_chain! {
errors {
/// Unparseable Frame
Expand Down
53 changes: 53 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ use umgap::taxon;
use umgap::errors::{Result, ErrorKind};
use umgap::taxon::TaxonId;
use umgap::agg;
use umgap::agg::Aggregator;
use umgap::rmq;
use umgap::tree;
use umgap::utils;
Expand Down Expand Up @@ -72,6 +73,7 @@ quick_main!(|| -> Result<()> {
args::Opt::BestOf(args) => bestof(args),
args::Opt::PrintIndex(args) => printindex(args),
args::Opt::SplitKmers(args) => splitkmers(args),
args::Opt::JoinKmers(args) => joinkmers(args),
args::Opt::BuildIndex => buildindex(),
args::Opt::CountRecords => countrecords(),
}
Expand Down Expand Up @@ -761,6 +763,57 @@ fn buildindex() -> Result<()> {
Ok(())
}

fn joinkmers(args: args::JoinKmers) -> Result<()> {
let mut reader = csv::ReaderBuilder::new()
.has_headers(false)
.delimiter(b'\t')
.from_reader(io::stdin());

let mut writer = csv::WriterBuilder::new()
.delimiter(b'\t')
.from_writer(io::stdout());

// Parsing the Taxa file
let taxons = taxon::read_taxa_file(args.taxon_file)?;

// Parsing the taxons
let tree = taxon::TaxonTree::new(&taxons);
let by_id = taxon::TaxonList::new(taxons);
let snapping = tree.snapping(&by_id, true);
let aggregator = tree::lca::LCACalculator::new(tree.root, &by_id);

let mut emit = |kmer: &str, tids: Vec<(TaxonId, f32)>| {
let counts = agg::count(tids.into_iter());
if let Ok(aggregate) = aggregator.aggregate(&counts) {
writer.serialize((kmer, snapping[aggregate].unwrap()))
} else {
Ok(())
}
};

// Iterate over records and emit groups
let mut current_kmer: Option<String> = Option::None;
let mut current_tids = vec!();
for record in reader.deserialize() {
let (kmer, tid): (String, TaxonId) = record?;
if let Some(c) = current_kmer {
if c != kmer {
emit(&c, current_tids)?;
current_tids = vec!();
}
} else {
current_tids = vec!();
}
current_kmer = Some(kmer);
current_tids.push((tid, 1.0));
}
if let Some(c) = current_kmer {
emit(&c, current_tids)?;
}

Ok(())
}

fn countrecords() -> Result<()> {
let mut records = 0;
let mut sequences = 0;
Expand Down

0 comments on commit 2bc265a

Please sign in to comment.