From aa5d358be428479e88564b3fa0c7614de366debc Mon Sep 17 00:00:00 2001
From: Bart Mesuere <Bart.Mesuere@UGent.be>
Date: Fri, 21 Jun 2024 13:59:11 +0200
Subject: [PATCH] implement prot2pept

---
 bin/prot2pept.ts          |  6 ++++
 lib/commands/prot2pept.ts | 73 +++++++++++++++++++++++++++++++++++++++
 package.json              |  2 ++
 3 files changed, 81 insertions(+)
 create mode 100755 bin/prot2pept.ts
 create mode 100644 lib/commands/prot2pept.ts
diff --git a/bin/prot2pept.ts b/bin/prot2pept.ts
new file mode 100755
index 00000000..d2d593be
--- /dev/null
+++ b/bin/prot2pept.ts
@@ -0,0 +1,6 @@
+#!/usr/bin/env node
+
+import { Prot2pept } from '../lib/commands/prot2pept.js';
+
+// Build the prot2pept command and start it; run() is async and the returned promise is not awaited here.
+new Prot2pept().run();
diff --git a/lib/commands/prot2pept.ts b/lib/commands/prot2pept.ts
new file mode 100644
index 00000000..ac7c8fb7
--- /dev/null
+++ b/lib/commands/prot2pept.ts
@@ -0,0 +1,73 @@
+import { createInterface } from 'node:readline';
+import { BaseCommand } from './base_command.js';
+
+export class Prot2pept extends BaseCommand {
+  readonly description = `The prot2pept command splits each protein sequence into a list of peptides according to a given cleavage-pattern. The command expects a list of protein sequences that are passed to standard input.
+
+The input should have either one protein sequence per line or contain a FASTA formatted list of protein sequences. FASTA headers are preserved in the output, so that peptides can be bundled per protein sequence.
+`;
+
+  constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean, args?: string[] }) {
+    super(options);
+    this.program
+      .summary("Splits each protein sequence into a list of peptides.")
+      .description(this.description)
+      .option("-p, --pattern <regex>", "specify cleavage-pattern (regex) as the pattern after which the next peptide will be cleaved. By default, it will create tryptic peptides.", "([KR])([^P])")
+  }
+
+  /**
+   * Streams proteins from stdin to stdout as peptides; FASTA mode applies only if the first line starts with ">".
+   * Performance note: like peptfilter, this can be made faster by using line events instead of async iterators.
+   */
+  async run(): Promise<void> {
+    this.parseArguments();
+    const pattern = new RegExp(this.program.opts().pattern, "g");
+
+    let fasta = false;
+    let protein: string[] = [];
+    // buffering output makes a big difference in performance
+    let output: string[] = [];
+    let i = 0;
+
+    for await (const line of createInterface({ input: process.stdin })) {
+      if (i === 0 && line.startsWith(">")) {
+        fasta = true;
+      }
+      i++;
+
+      if (fasta) { // if we're in fasta mode, a protein could be split over multiple lines
+        if (line.startsWith(">")) { // if we encounter a new header, process the previous protein and output the current header
+          if (protein.length > 0) {
+            output.push(Prot2pept.splitProtein(protein.join(""), pattern));
+          }
+          output.push(line.trimEnd());
+          protein = [];
+        } else {
+          protein.push(line.trimEnd());
+        }
+      } else { // if we're not in fasta mode, each line is a protein sequence
+        output.push(Prot2pept.splitProtein(line.trimEnd(), pattern));
+      }
+
+      if (i % 1000 === 0) {
+        output.push(""); //add a newline at the end of the buffer without additional string copy
+        process.stdout.write(output.join("\n"));
+        output = [];
+      }
+    }
+
+    if (fasta && protein.length > 0) { // process the last fasta protein; skip it when empty to avoid a stray blank line
+      output.push(Prot2pept.splitProtein(protein.join(""), pattern));
+    }
+    output.push("");
+    process.stdout.write(output.join("\n"));
+  }
+
+  /**
+   * Inserts a newline between the two capture groups of each `pattern` match (the regex needs the "g" flag). It runs
+   * twice because a match consumes its second group, hiding an adjacent site; doubled newlines are collapsed last.
+   */
+  static splitProtein(line: string, pattern: RegExp): string {
+    return line.replaceAll(pattern, "$1\n$2").replaceAll(pattern, "$1\n$2").replaceAll("\n\n", "\n");
+  }
+}
diff --git a/package.json b/package.json
index ddaa71bc..f27f3c4c 100644
--- a/package.json
+++ b/package.json
@@ -9,6 +9,7 @@
   "type": "module",
   "bin": {
     "peptfilter": "./bin/peptfilter.js",
+    "prot2pept": "./bin/prot2pept.js",
     "uniprot": "./bin/uniprot.js"
   },
   "scripts": {
@@ -17,6 +18,7 @@
     "test": "NODE_OPTIONS='--experimental-vm-modules --no-warnings' yarn run jest",
     "typecheck": "yarn tsc --skipLibCheck --noEmit",
     "peptfilter": "yarn run tsx bin/peptfilter.ts",
+    "prot2pept": "yarn run tsx bin/prot2pept.ts",
     "uniprot": "yarn run tsx bin/uniprot.ts"
   },
   "dependencies": {