add prot2tryp2lca subcommand

unipept · Jul 10, 2019 · d224b88 · d224b88
1 parent 99c45b7
commit d224b88
Show file tree

Hide file tree

Showing 3 changed files with 105 additions and 1 deletion.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "umgap"
-version = "0.3.0"
+version = "0.3.1"
 authors = ["Felix Van der Jeugt <[email protected]>",
            "Stijn Seghers <[email protected]>",
            "Niels De Graef <[email protected]>",

diff --git a/src/args.rs b/src/args.rs
@@ -129,6 +129,11 @@ pub enum Opt {
 	#[structopt(name = "prot2kmer2lca")]
 	ProtToKmerToLca(ProtToKmerToLca),
 
+	/// Reads all the records in a specified FASTA file and queries the
+	/// tryptic peptides in an FST for the LCA's.
+	#[structopt(name = "prot2tryp2lca")]
+	ProtToTrypToLca(ProtToTrypToLca),
+
 	/// Aggregates taxa to a single taxon.
 	#[structopt(name = "taxa2agg")]
 	TaxaToAgg(TaxaToAgg),
@@ -289,6 +294,54 @@ pub struct ProtToKmerToLca {
 	pub chunk_size: usize,
 }
 
+/// Reads all the records in a specified FASTA file and queries the
+/// tryptic peptides in an FST for the LCA's.
+#[derive(Debug, StructOpt)]
+pub struct ProtToTrypToLca {
+	/// Map unknown sequences to 0 instead of ignoring them
+	#[structopt(short = "o", long = "one-on-one")]
+	pub one_on_one: bool,
+
+	/// An FST to query
+	#[structopt(parse(from_os_str))]
+	pub fst_file: PathBuf,
+
+	/// Load FST in memory instead of mmap'ing the file contents. This makes
+	/// querying significantly faster, but requires some time to load the FST
+	/// into memory.
+	#[structopt(short = "m", long = "in-memory")]
+	pub fst_in_memory: bool,
+
+	/// How much reads to group together in one chunk, bigger chunks decrease
+	/// the overhead caused by multithreading. Because the output order is not
+	/// necessarily the same as the input order, having a chunk size which is
+	/// a multiple of 12 (all 6 translations multiplied by the two paired-end
+	/// reads) will keep sequences of the same reads together.
+	#[structopt(short = "c", long = "chunksize", default_value = "240")]
+	pub chunk_size: usize,
+
+	/// The cleavage-pattern (regex), i.e. the pattern after which
+	/// the next peptide will be cleaved for tryptic peptides)
+	#[structopt(short = "p", long = "pattern", default_value = "([KR])([^P])")]
+	pub pattern: String,
+
+	/// Minimum length of tryptic peptides to be mapped
+	#[structopt(short = "l", long = "minlen", default_value = "5")]
+	pub min_length: usize,
+
+	/// Maximum length of tryptic peptides to be mapped
+	#[structopt(short = "L", long = "maxlen", default_value = "50")]
+	pub max_length: usize,
+
+	/// The letters that a tryptic peptide must contain
+	#[structopt(short = "c", long = "contains", default_value = "")]
+	pub contains: String,
+
+	/// The letters that a tryptic peptide mustn't contain
+	#[structopt(short = "l", long = "lacks", default_value = "")]
+	pub lacks: String,
+}
+
 /// Aggregates taxa to a single taxon.
 #[derive(Debug, StructOpt)]
 pub struct TaxaToAgg {

diff --git a/src/main.rs b/src/main.rs
@@ -57,6 +57,7 @@ quick_main!(|| -> Result<()> {
 		args::Opt::Translate(args)       => translate(args),
 		args::Opt::PeptToLca(args)       => pept2lca(args),
 		args::Opt::ProtToKmerToLca(args) => prot2kmer2lca(args),
+		args::Opt::ProtToTrypToLca(args) => prot2tryp2lca(args),
 		args::Opt::TaxaToAgg(args)       => taxa2agg(args),
 		args::Opt::ProtToPept(args)      => prot2pept(args),
 		args::Opt::ProtToKmer(args)      => prot2kmer(args),
@@ -221,6 +222,56 @@ fn prot2kmer2lca(args: args::ProtToKmerToLca) -> Result<()> {
 	}
 }
 
+fn prot2tryp2lca(args: args::ProtToTrypToLca) -> Result<()> {
+	let fst = if args.fst_in_memory {
+		let bytes = fs::read(&args.fst_file)?;
+		fst::Map::from_bytes(bytes)?
+	} else {
+		unsafe { fst::Map::from_path(&args.fst_file) }?
+	};
+	let default = if args.one_on_one { Some(0) } else { None };
+	let pattern = regex::Regex::new(&args.pattern)?;
+	let contains = args.contains.chars().collect::<HashSet<char>>();
+	let lacks = args.lacks.chars().collect::<HashSet<char>>();
+
+	fasta::Reader::new(io::stdin(), false)
+		.records()
+		.chunked(args.chunk_size)
+		.par_bridge()
+		.map(|chunk| {
+			let chunk = chunk?;
+			let mut chunk_output = String::new();
+			for read in chunk {
+				chunk_output.push_str(&format!(">{}\n", read.header));
+				for seq in read.sequence {
+					// We will run the regex replacement twice, since a letter can be
+					// matched twice (i.e. once after and once before the split).
+					let first_run = pattern.replace_all(&seq, "$1\n$2");
+					for peptide in pattern.replace_all(&first_run, "$1\n$2").replace("*", "\n").lines()
+							.filter(|x| !x.is_empty())
+							.filter(|seq| {
+								let length = seq.len();
+								length >= args.min_length && length <= args.max_length
+							})
+							.filter(|seq| (contains.is_empty() && lacks.is_empty()) || {
+								let set = seq.chars().collect::<HashSet<char>>();
+								contains.intersection(&set).count() == contains.len()
+								    && lacks.intersection(&set).count() == 0
+							}) {
+						if let Some(lca) = fst.get(&peptide).map(Some).unwrap_or(default) {
+							chunk_output.push_str(&format!("{}\n", lca));
+						}
+					}
+				}
+			}
+			// TODO: make this the result of the map
+			// and print using a Writer
+			print!("{}", chunk_output);
+			Ok(())
+		})
+		.collect()
+}
+
 fn taxa2agg(args: args::TaxaToAgg) -> Result<()> {
 	// Parsing the Taxa file
 	let taxons = taxon::read_taxa_file(args.taxon_file)?;