From aa5d358be428479e88564b3fa0c7614de366debc Mon Sep 17 00:00:00 2001
From: Bart Mesuere
Date: Fri, 21 Jun 2024 13:59:11 +0200
Subject: [PATCH] implement prot2pept

---
 bin/prot2pept.ts          |   6 +++
 lib/commands/prot2pept.ts | 107 ++++++++++++++++++++++++++++++++++++++
 package.json              |   2 +
 3 files changed, 115 insertions(+)
 create mode 100755 bin/prot2pept.ts
 create mode 100644 lib/commands/prot2pept.ts

diff --git a/bin/prot2pept.ts b/bin/prot2pept.ts
new file mode 100755
index 00000000..d2d593be
--- /dev/null
+++ b/bin/prot2pept.ts
@@ -0,0 +1,6 @@
+#!/usr/bin/env node
+
+import { Prot2pept } from '../lib/commands/prot2pept.js';
+
+const command = new Prot2pept();
+command.run();
diff --git a/lib/commands/prot2pept.ts b/lib/commands/prot2pept.ts
new file mode 100644
index 00000000..ac7c8fb7
--- /dev/null
+++ b/lib/commands/prot2pept.ts
@@ -0,0 +1,107 @@
+import { createInterface } from 'node:readline';
+import { BaseCommand } from './base_command.js';
+
+/**
+ * Implements the prot2pept command: reads protein sequences from standard input, cleaves them into peptides and
+ * writes those peptides to standard output.
+ */
+export class Prot2pept extends BaseCommand {
+
+  readonly description = `The prot2pept command splits each protein sequence into a list of peptides according to a given cleavage-pattern. The command expects a list of protein sequences that are passed to standard input.
+
+The input should have either one protein sequence per line or contain a FASTA formatted list of protein sequences. FASTA headers are preserved in the output, so that peptides can be bundled per protein sequence.
+`;
+
+  /**
+   * Registers the summary, the description and the `--pattern` option on the command's program.
+   *
+   * @param options Passed through unchanged to the BaseCommand constructor.
+   */
+  constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean, args?: string[] }) {
+    super(options);
+
+    this.program
+      .summary("Splits each protein sequence into a list of peptides.")
+      .description(this.description)
+      // <regex> declares that the option takes a value (assuming commander-style flags on this.program); a bare
+      // "--pattern" would not accept the cleavage pattern as its argument
+      .option("-p, --pattern <regex>", "specify cleavage-pattern (regex) as the pattern after which the next peptide will be cleaved. By default, it will create tryptic peptides.", "([KR])([^P])");
+  }
+
+  /**
+   * Reads proteins from standard input, cleaves them with the configured pattern and writes the peptides to standard
+   * output.
+   *
+   * The input is treated as FASTA when its very first line starts with ">". In FASTA mode a protein may be spread
+   * over several lines: headers are echoed and the sequence lines between two headers are joined before cleaving.
+   * Otherwise every line is cleaved as a protein of its own.
+   *
+   * Performance note: Just as with peptfilter, this implementation can be made faster by using line events instead of
+   * async iterators.
+   */
+  async run(): Promise<void> {
+    this.parseArguments();
+    const pattern = new RegExp(this.program.opts().pattern, "g");
+
+    let fasta = false;
+    let protein: string[] = [];
+
+    // buffering output makes a big difference in performance
+    let output: string[] = [];
+    let i = 0;
+
+    for await (const line of createInterface({ input: process.stdin })) {
+      if (i === 0 && line.startsWith(">")) {
+        fasta = true;
+      }
+
+      i++;
+
+      if (fasta) { // if we're in fasta mode, a protein could be split over multiple lines
+        if (line.startsWith(">")) { // if we encounter a new header, process the previous protein and output the current header
+          if (protein.length > 0) {
+            output.push(Prot2pept.splitProtein(protein.join(""), pattern));
+          }
+          output.push(line.trimEnd());
+          protein = [];
+        } else {
+          protein.push(line.trimEnd());
+        }
+      } else { // if we're not in fasta mode, each line is a protein sequence
+        output.push(Prot2pept.splitProtein(line.trimEnd(), pattern));
+      }
+
+      if (i % 1000 === 0) {
+        output.push(""); //add a newline at the end of the buffer without additional string copy
+        process.stdout.write(output.join("\n"));
+        output = [];
+      }
+    }
+
+    // if in fasta mode, process the last protein; a header without any sequence is skipped here exactly as it is
+    // inside the loop, so no stray empty line is written
+    if (fasta && protein.length > 0) {
+      output.push(Prot2pept.splitProtein(protein.join(""), pattern));
+    }
+    output.push("");
+    process.stdout.write(output.join("\n"));
+  }
+
+  /**
+   * Cleaves a protein into peptides.
+   *
+   * @param line The protein sequence to cleave.
+   * @param pattern The cleavage pattern. It has to be a global regular expression (replaceAll throws on a non-global
+   * one) whose first and second capture group hold the text before and after the cleavage site.
+   * @returns The peptides of the protein, separated by newlines.
+   */
+  static splitProtein(line: string, pattern: RegExp): string {
+    // Matches cannot overlap, so a residue consumed as the second group of one match is never tried as the first
+    // group of the next (e.g. the R in "KRA"). The second pass catches those sites; the newlines it doubles at
+    // sites that were already cleaved are collapsed again by the last replacement.
+    return line
+      .replaceAll(pattern, "$1\n$2")
+      .replaceAll(pattern, "$1\n$2")
+      .replaceAll("\n\n", "\n");
+  }
+}
diff --git a/package.json b/package.json
index ddaa71bc..f27f3c4c 100644
--- a/package.json
+++ b/package.json
@@ -9,6 +9,7 @@
   "type": "module",
   "bin": {
     "peptfilter": "./bin/peptfilter.js",
+    "prot2pept": "./bin/prot2pept.js",
     "uniprot": "./bin/uniprot.js"
   },
   "scripts": {
@@ -17,6 +18,7 @@
     "test": "NODE_OPTIONS='--experimental-vm-modules --no-warnings' yarn run jest",
     "typecheck": "yarn tsc --skipLibCheck --noEmit",
     "peptfilter": "yarn run tsx bin/peptfilter.ts",
+    "prot2pept": "yarn run tsx bin/prot2pept.ts",
     "uniprot": "yarn run tsx bin/uniprot.ts"
   },
   "dependencies": {