Skip to content

Commit

Permalink
add prot2tryp2lca subcommand
Browse files Browse the repository at this point in the history
  • Loading branch information
Felix Van der Jeugt committed Jul 10, 2019
1 parent 99c45b7 commit d224b88
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 1 deletion.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "umgap"
version = "0.3.0"
version = "0.3.1"
authors = ["Felix Van der Jeugt <[email protected]>",
"Stijn Seghers <[email protected]>",
"Niels De Graef <[email protected]>",
Expand Down
53 changes: 53 additions & 0 deletions src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,11 @@ pub enum Opt {
#[structopt(name = "prot2kmer2lca")]
ProtToKmerToLca(ProtToKmerToLca),

/// Reads all the records in a specified FASTA file and queries the
/// tryptic peptides in an FST for the LCA's.
#[structopt(name = "prot2tryp2lca")]
ProtToTrypToLca(ProtToTrypToLca),

/// Aggregates taxa to a single taxon.
#[structopt(name = "taxa2agg")]
TaxaToAgg(TaxaToAgg),
Expand Down Expand Up @@ -289,6 +294,54 @@ pub struct ProtToKmerToLca {
pub chunk_size: usize,
}

/// Reads all the records in a specified FASTA file and queries the
/// tryptic peptides in an FST for the LCA's.
#[derive(Debug, StructOpt)]
pub struct ProtToTrypToLca {
/// Map unknown sequences to 0 instead of ignoring them
#[structopt(short = "o", long = "one-on-one")]
pub one_on_one: bool,

/// An FST to query
#[structopt(parse(from_os_str))]
pub fst_file: PathBuf,

/// Load FST in memory instead of mmap'ing the file contents. This makes
/// querying significantly faster, but requires some time to load the FST
/// into memory.
#[structopt(short = "m", long = "in-memory")]
pub fst_in_memory: bool,

/// How much reads to group together in one chunk, bigger chunks decrease
/// the overhead caused by multithreading. Because the output order is not
/// necessarily the same as the input order, having a chunk size which is
/// a multiple of 12 (all 6 translations multiplied by the two paired-end
/// reads) will keep sequences of the same reads together.
#[structopt(short = "c", long = "chunksize", default_value = "240")]
pub chunk_size: usize,

/// The cleavage-pattern (regex), i.e. the pattern after which
/// the next peptide will be cleaved for tryptic peptides)
#[structopt(short = "p", long = "pattern", default_value = "([KR])([^P])")]
pub pattern: String,

/// Minimum length of tryptic peptides to be mapped
#[structopt(short = "l", long = "minlen", default_value = "5")]
pub min_length: usize,

/// Maximum length of tryptic peptides to be mapped
#[structopt(short = "L", long = "maxlen", default_value = "50")]
pub max_length: usize,

/// The letters that a tryptic peptide must contain
#[structopt(short = "c", long = "contains", default_value = "")]
pub contains: String,

/// The letters that a tryptic peptide mustn't contain
#[structopt(short = "l", long = "lacks", default_value = "")]
pub lacks: String,
}

/// Aggregates taxa to a single taxon.
#[derive(Debug, StructOpt)]
pub struct TaxaToAgg {
Expand Down
51 changes: 51 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ quick_main!(|| -> Result<()> {
args::Opt::Translate(args) => translate(args),
args::Opt::PeptToLca(args) => pept2lca(args),
args::Opt::ProtToKmerToLca(args) => prot2kmer2lca(args),
args::Opt::ProtToTrypToLca(args) => prot2tryp2lca(args),
args::Opt::TaxaToAgg(args) => taxa2agg(args),
args::Opt::ProtToPept(args) => prot2pept(args),
args::Opt::ProtToKmer(args) => prot2kmer(args),
Expand Down Expand Up @@ -221,6 +222,56 @@ fn prot2kmer2lca(args: args::ProtToKmerToLca) -> Result<()> {
}
}

fn prot2tryp2lca(args: args::ProtToTrypToLca) -> Result<()> {
let fst = if args.fst_in_memory {
let bytes = fs::read(&args.fst_file)?;
fst::Map::from_bytes(bytes)?
} else {
unsafe { fst::Map::from_path(&args.fst_file) }?
};
let default = if args.one_on_one { Some(0) } else { None };
let pattern = regex::Regex::new(&args.pattern)?;
let contains = args.contains.chars().collect::<HashSet<char>>();
let lacks = args.lacks.chars().collect::<HashSet<char>>();

fasta::Reader::new(io::stdin(), false)
.records()
.chunked(args.chunk_size)
.par_bridge()
.map(|chunk| {
let chunk = chunk?;
let mut chunk_output = String::new();
for read in chunk {
chunk_output.push_str(&format!(">{}\n", read.header));
for seq in read.sequence {
// We will run the regex replacement twice, since a letter can be
// matched twice (i.e. once after and once before the split).
let first_run = pattern.replace_all(&seq, "$1\n$2");
for peptide in pattern.replace_all(&first_run, "$1\n$2").replace("*", "\n").lines()
.filter(|x| !x.is_empty())
.filter(|seq| {
let length = seq.len();
length >= args.min_length && length <= args.max_length
})
.filter(|seq| (contains.is_empty() && lacks.is_empty()) || {
let set = seq.chars().collect::<HashSet<char>>();
contains.intersection(&set).count() == contains.len()
&& lacks.intersection(&set).count() == 0
}) {
if let Some(lca) = fst.get(&peptide).map(Some).unwrap_or(default) {
chunk_output.push_str(&format!("{}\n", lca));
}
}
}
}
// TODO: make this the result of the map
// and print using a Writer
print!("{}", chunk_output);
Ok(())
})
.collect()
}

fn taxa2agg(args: args::TaxaToAgg) -> Result<()> {
// Parsing the Taxa file
let taxons = taxon::read_taxa_file(args.taxon_file)?;
Expand Down

0 comments on commit d224b88

Please sign in to comment.