Skip to content

Commit

Permalink
Add splitkmers command
Browse files Browse the repository at this point in the history
  • Loading branch information
Felix Van der Jeugt committed Jan 7, 2020
1 parent ba32734 commit 9886fe7
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 4 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "umgap"
version = "0.3.3"
version = "0.3.4"
authors = ["Felix Van der Jeugt <[email protected]>",
"Stijn Seghers <[email protected]>",
"Niels De Graef <[email protected]>",
Expand Down
12 changes: 12 additions & 0 deletions src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,10 @@ pub enum Opt {
#[structopt(name = "printindex")]
PrintIndex(PrintIndex),

/// Splits each taxon id + protein sequence pair in a tab-separated format into a list of kmers.
#[structopt(name = "splitkmers")]
SplitKmers(SplitKmers),

/// Write an FST index of stdin on stdout.
#[structopt(name = "buildindex")]
BuildIndex,
Expand Down Expand Up @@ -550,6 +554,14 @@ pub struct PrintIndex {
pub fst_file: PathBuf,
}

/// Splits each taxon id + protein sequence pair in a CSV format into a list of kmers.
// Command-line arguments for the `splitkmers` subcommand; the struct is
// consumed by `splitkmers` in src/main.rs, which reads its records with a tab
// (`\t`) delimiter.
#[derive(Debug, StructOpt)]
pub struct SplitKmers {
/// The K in K-mers
// Accepted as `-k` or `--length`; falls back to 9 when the flag is omitted.
// The field is a plain `usize`, so the type itself does not reject 0.
#[structopt(short = "k", long = "length", default_value = "9")]
pub length: usize,
}

error_chain! {
errors {
/// Unparseable Frame
Expand Down
1 change: 0 additions & 1 deletion src/dna/translation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,5 +225,4 @@ mod tests {
fn test_number_of_codons() {
    // Exhaust a freshly built iterator and check how many items it yielded.
    let codon_count = CodonIterator::new().count();
    assert_eq!(64, codon_count);
}

}
22 changes: 22 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ quick_main!(|| -> Result<()> {
args::Opt::Report(args) => report(args),
args::Opt::BestOf(args) => bestof(args),
args::Opt::PrintIndex(args) => printindex(args),
args::Opt::SplitKmers(args) => splitkmers(args),
args::Opt::BuildIndex => buildindex(),
args::Opt::CountRecords => countrecords(),
}
Expand Down Expand Up @@ -715,6 +716,27 @@ fn printindex(args: args::PrintIndex) -> Result<()> {
Ok(())
}

fn splitkmers(args: args::SplitKmers) -> Result<()> {
let mut reader = csv::ReaderBuilder::new()
.has_headers(false)
.delimiter(b'\t')
.from_reader(io::stdin());

let mut writer = csv::WriterBuilder::new()
.delimiter(b'\t')
.from_writer(io::stdout());

for record in reader.deserialize() {
let (tid, sequence): (TaxonId, String) = record?;
if sequence.len() < args.length { continue }
for kmer in sequence.as_bytes().windows(args.length) {
writer.serialize((String::from_utf8_lossy(kmer), tid))?;
}
}

Ok(())
}

fn buildindex() -> Result<()> {
let mut reader = csv::ReaderBuilder::new()
.has_headers(false)
Expand Down
2 changes: 1 addition & 1 deletion src/rmq/rmq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ impl<T: Ord + Display> RMQ<T> {
.min_by_key(|&(_, val)| val)
.expect("So, it has come to this.")
.0 + i * size()
})
})
.collect()
}

Expand Down
2 changes: 1 addition & 1 deletion src/tree/tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ impl<T: Default + Copy> Tree<T> {
set.iter()
.map(|&tid| Tree::create(tid, children, taxons))
.collect()
})
})
.unwrap_or(Vec::new()) }
}

Expand Down

0 comments on commit 9886fe7

Please sign in to comment.