diff --git a/Cargo.toml b/Cargo.toml index 5c30ed0..d4364e0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "umgap" -version = "0.3.0" +version = "0.3.1" authors = ["Felix Van der Jeugt ", "Stijn Seghers ", "Niels De Graef ", diff --git a/src/args.rs b/src/args.rs index 633c356..a0f7902 100644 --- a/src/args.rs +++ b/src/args.rs @@ -129,6 +129,11 @@ pub enum Opt { #[structopt(name = "prot2kmer2lca")] ProtToKmerToLca(ProtToKmerToLca), + /// Reads all the records in a specified FASTA file and queries the + /// tryptic peptides in an FST for the LCA's. + #[structopt(name = "prot2tryp2lca")] + ProtToTrypToLca(ProtToTrypToLca), + /// Aggregates taxa to a single taxon. #[structopt(name = "taxa2agg")] TaxaToAgg(TaxaToAgg), @@ -289,6 +294,54 @@ pub struct ProtToKmerToLca { pub chunk_size: usize, } +/// Reads all the records in a specified FASTA file and queries the +/// tryptic peptides in an FST for the LCA's. +#[derive(Debug, StructOpt)] +pub struct ProtToTrypToLca { + /// Map unknown sequences to 0 instead of ignoring them + #[structopt(short = "o", long = "one-on-one")] + pub one_on_one: bool, + + /// An FST to query + #[structopt(parse(from_os_str))] + pub fst_file: PathBuf, + + /// Load FST in memory instead of mmap'ing the file contents. This makes + /// querying significantly faster, but requires some time to load the FST + /// into memory. + #[structopt(short = "m", long = "in-memory")] + pub fst_in_memory: bool, + + /// How much reads to group together in one chunk, bigger chunks decrease + /// the overhead caused by multithreading. Because the output order is not + /// necessarily the same as the input order, having a chunk size which is + /// a multiple of 12 (all 6 translations multiplied by the two paired-end + /// reads) will keep sequences of the same reads together. + #[structopt(short = "c", long = "chunksize", default_value = "240")] + pub chunk_size: usize, + + /// The cleavage-pattern (regex), i.e. the pattern after which + /// the next peptide will be cleaved for tryptic peptides) + #[structopt(short = "p", long = "pattern", default_value = "([KR])([^P])")] + pub pattern: String, + + /// Minimum length of tryptic peptides to be mapped + #[structopt(short = "l", long = "minlen", default_value = "5")] + pub min_length: usize, + + /// Maximum length of tryptic peptides to be mapped + #[structopt(short = "L", long = "maxlen", default_value = "50")] + pub max_length: usize, + + /// The letters that a tryptic peptide must contain + #[structopt(short = "c", long = "contains", default_value = "")] + pub contains: String, + + /// The letters that a tryptic peptide mustn't contain + #[structopt(short = "l", long = "lacks", default_value = "")] + pub lacks: String, +} + /// Aggregates taxa to a single taxon. #[derive(Debug, StructOpt)] pub struct TaxaToAgg { diff --git a/src/main.rs b/src/main.rs index ffc9748..db2fe6b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -57,6 +57,7 @@ quick_main!(|| -> Result<()> { args::Opt::Translate(args) => translate(args), args::Opt::PeptToLca(args) => pept2lca(args), args::Opt::ProtToKmerToLca(args) => prot2kmer2lca(args), + args::Opt::ProtToTrypToLca(args) => prot2tryp2lca(args), args::Opt::TaxaToAgg(args) => taxa2agg(args), args::Opt::ProtToPept(args) => prot2pept(args), args::Opt::ProtToKmer(args) => prot2kmer(args), @@ -221,6 +222,56 @@ fn prot2kmer2lca(args: args::ProtToKmerToLca) -> Result<()> { } } +fn prot2tryp2lca(args: args::ProtToTrypToLca) -> Result<()> { + let fst = if args.fst_in_memory { + let bytes = fs::read(&args.fst_file)?; + fst::Map::from_bytes(bytes)? + } else { + unsafe { fst::Map::from_path(&args.fst_file) }? + }; + let default = if args.one_on_one { Some(0) } else { None }; + let pattern = regex::Regex::new(&args.pattern)?; + let contains = args.contains.chars().collect::>(); + let lacks = args.lacks.chars().collect::>(); + + fasta::Reader::new(io::stdin(), false) + .records() + .chunked(args.chunk_size) + .par_bridge() + .map(|chunk| { + let chunk = chunk?; + let mut chunk_output = String::new(); + for read in chunk { + chunk_output.push_str(&format!(">{}\n", read.header)); + for seq in read.sequence { + // We will run the regex replacement twice, since a letter can be + // matched twice (i.e. once after and once before the split). + let first_run = pattern.replace_all(&seq, "$1\n$2"); + for peptide in pattern.replace_all(&first_run, "$1\n$2").replace("*", "\n").lines() + .filter(|x| !x.is_empty()) + .filter(|seq| { + let length = seq.len(); + length >= args.min_length && length <= args.max_length + }) + .filter(|seq| (contains.is_empty() && lacks.is_empty()) || { + let set = seq.chars().collect::>(); + contains.intersection(&set).count() == contains.len() + && lacks.intersection(&set).count() == 0 + }) { + if let Some(lca) = fst.get(&peptide).map(Some).unwrap_or(default) { + chunk_output.push_str(&format!("{}\n", lca)); + } + } + } + } + // TODO: make this the result of the map + // and print using a Writer + print!("{}", chunk_output); + Ok(()) + }) + .collect() +} + fn taxa2agg(args: args::TaxaToAgg) -> Result<()> { // Parsing the Taxa file let taxons = taxon::read_taxa_file(args.taxon_file)?;