From aa5d358be428479e88564b3fa0c7614de366debc Mon Sep 17 00:00:00 2001
From: Bart Mesuere
Date: Fri, 21 Jun 2024 13:59:11 +0200
Subject: [PATCH 1/3] implement prot2pept

---
 bin/prot2pept.ts          |  6 ++++
 lib/commands/prot2pept.ts | 73 +++++++++++++++++++++++++++++++++++++++
 package.json              |  2 ++
 3 files changed, 81 insertions(+)
 create mode 100755 bin/prot2pept.ts
 create mode 100644 lib/commands/prot2pept.ts

diff --git a/bin/prot2pept.ts b/bin/prot2pept.ts
new file mode 100755
index 00000000..d2d593be
--- /dev/null
+++ b/bin/prot2pept.ts
@@ -0,0 +1,6 @@
+#!/usr/bin/env node
+
+import { Prot2pept } from '../lib/commands/prot2pept.js';
+
+const command = new Prot2pept();
+command.run();
diff --git a/lib/commands/prot2pept.ts b/lib/commands/prot2pept.ts
new file mode 100644
index 00000000..ac7c8fb7
--- /dev/null
+++ b/lib/commands/prot2pept.ts
@@ -0,0 +1,73 @@
+import { createInterface } from 'node:readline';
+import { BaseCommand } from './base_command.js';
+
+export class Prot2pept extends BaseCommand {
+
+  readonly description = `The prot2pept command splits each protein sequence into a list of peptides according to a given cleavage-pattern. The command expects a list of protein sequences that are passed to standard input.
+
+The input should have either one protein sequence per line or contain a FASTA formatted list of protein sequences. FASTA headers are preserved in the output, so that peptides can be bundled per protein sequence.
+`;
+
+  constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean, args?: string[] }) {
+    super(options);
+
+    this.program
+      .summary("Splits each protein sequence into a list of peptides.")
+      .description(this.description)
+      .option("-p, --pattern <regex>", "specify cleavage-pattern (regex) as the pattern after which the next peptide will be cleaved. By default, it will create tryptic peptides.", "([KR])([^P])")
+  }
+
+  /**
+   * Performance note: Just as with peptfilter, this implementation can be made faster by using line events instead of
+   * async iterators.
+   */
+  async run() {
+    this.parseArguments();
+    const pattern = new RegExp(this.program.opts().pattern, "g");
+
+    let fasta = false;
+    let protein = [];
+
+    // buffering output makes a big difference in performance
+    let output = [];
+    let i = 0;
+
+    for await (const line of createInterface({ input: process.stdin })) {
+      if (i === 0 && line.startsWith(">")) {
+        fasta = true;
+      }
+
+      i++;
+
+      if (fasta) { // if we're in fasta mode, a protein could be split over multiple lines
+        if (line.startsWith(">")) { // if we encounter a new header, process the previous protein and output the current header
+          if (protein.length > 0) {
+            output.push(Prot2pept.splitProtein(protein.join(""), pattern));
+          }
+          output.push(line.trimEnd());
+          protein = [];
+        } else {
+          protein.push(line.trimEnd());
+        }
+      } else { // if we're not in fasta mode, each line is a protein sequence
+        output.push(Prot2pept.splitProtein(line.trimEnd(), pattern));
+      }
+
+      if (i % 1000 === 0) {
+        output.push(""); //add a newline at the end of the buffer without additional string copy
+        process.stdout.write(output.join("\n"));
+        output = [];
+      }
+    }
+
+    if (fasta) { // if in fasta mode, process the last protein
+      output.push(Prot2pept.splitProtein(protein.join(""), pattern));
+    }
+    output.push("");
+    process.stdout.write(output.join("\n"));
+  }
+
+  static splitProtein(line: string, pattern: RegExp): string {
+    return line.replaceAll(pattern, "$1\n$2").replaceAll(pattern, "$1\n$2").replaceAll("\n\n", "\n");
+  }
+}
diff --git a/package.json b/package.json
index ddaa71bc..f27f3c4c 100644
--- a/package.json
+++ b/package.json
@@ -9,6 +9,7 @@
   "type": "module",
   "bin": {
     "peptfilter": "./bin/peptfilter.js",
+    "prot2pept": "./bin/prot2pept.js",
     "uniprot": "./bin/uniprot.js"
   },
   "scripts": {
@@ -17,6 +18,7 @@
     "test": "NODE_OPTIONS='--experimental-vm-modules --no-warnings' yarn run jest",
     "typecheck": "yarn tsc --skipLibCheck --noEmit",
     "peptfilter": "yarn run tsx bin/peptfilter.ts",
+    "prot2pept": "yarn run tsx bin/prot2pept.ts",
     "uniprot": "yarn run tsx bin/uniprot.ts"
   },
   "dependencies": {

From c24382a1d725103d8bc3615b6c1f8630b3efcbc9 Mon Sep 17 00:00:00 2001
From: Bart Mesuere
Date: Fri, 21 Jun 2024 14:30:08 +0200
Subject: [PATCH 2/3] add tests

---
 lib/commands/prot2pept.ts        |   9 ++-
 tests/commands/prot2pept.test.ts | 124 +++++++++++++++++++++++++++++++
 tsconfig.json                    |   1 +
 3 files changed, 133 insertions(+), 1 deletion(-)
 create mode 100644 tests/commands/prot2pept.test.ts

diff --git a/lib/commands/prot2pept.ts b/lib/commands/prot2pept.ts
index ac7c8fb7..3b3a6b01 100644
--- a/lib/commands/prot2pept.ts
+++ b/lib/commands/prot2pept.ts
@@ -23,7 +23,14 @@ The input should have either one protein sequence per line or contain a FASTA fo
    */
   async run() {
     this.parseArguments();
-    const pattern = new RegExp(this.program.opts().pattern, "g");
+
+    let pattern;
+    try {
+      pattern = new RegExp(this.program.opts().pattern, "g");
+    } catch (e) {
+      this.program.error(`Your pattern was invalid: ${(e as Error).message}`);
+      //process.exit(1);
+    }
 
     let fasta = false;
     let protein = [];
diff --git a/tests/commands/prot2pept.test.ts b/tests/commands/prot2pept.test.ts
new file mode 100644
index 00000000..ca01533a
--- /dev/null
+++ b/tests/commands/prot2pept.test.ts
@@ -0,0 +1,124 @@
+import { Prot2pept } from '../../lib/commands/prot2pept';
+import { jest } from '@jest/globals';
+import * as mock from 'mock-stdin';
+
+let output: string[];
+let error: string[];
+// eslint-disable-next-line @typescript-eslint/no-unused-vars
+const writeSpy = jest
+  .spyOn(process.stdout, "write")
+  .mockImplementation((data: unknown) => { output.push(data as string); return true; });
+const errorSpy = jest
+  .spyOn(process.stderr, "write")
+  .mockImplementation((data: unknown) => { error.push(data as string); return true; });
+
+beforeEach(() => {
+  output = [];
+  error = [];
+});
+
+test('test single line input 1', async () => {
+  const stdin = mock.stdin();
+
+  const command = new Prot2pept();
+  const run = command.run();
+
+  stdin.send("AALTERAALTERPAALTER\n");
+  stdin.end();
+
+  await run;
+
+  expect(errorSpy).toHaveBeenCalledTimes(0);
+  expect(output.join("").trimEnd()).toBe("AALTER\nAALTERPAALTER");
+});
+
+test('test single line input 2', async () => {
+  const stdin = mock.stdin();
+
+  const command = new Prot2pept();
+  const run = command.run();
+
+  stdin.send("KRKPR\n");
+  stdin.end();
+
+  await run;
+
+  expect(errorSpy).toHaveBeenCalledTimes(0);
+  expect(output.join("").trimEnd()).toBe("K\nR\nKPR");
+});
+
+test('test multi line input', async () => {
+  const stdin = mock.stdin();
+
+  const command = new Prot2pept();
+  const run = command.run();
+
+  stdin.send("AALTERAALTERPAALTER\n");
+  stdin.send("AALTERAA\n");
+  stdin.end();
+
+  await run;
+
+  expect(errorSpy).toHaveBeenCalledTimes(0);
+  expect(output.join("").trimEnd()).toBe("AALTER\nAALTERPAALTER\nAALTER\nAA");
+});
+
+test('test fasta input 1', async () => {
+  const stdin = mock.stdin();
+
+  const command = new Prot2pept();
+  const run = command.run();
+
+  stdin.send(">AKA\nAALTERAALTERPAALTER\n");
+  stdin.end();
+
+  await run;
+
+  expect(errorSpy).toHaveBeenCalledTimes(0);
+  expect(output.join("").trimEnd()).toBe(">AKA\nAALTER\nAALTERPAALTER");
+});
+
+test('test fasta input 2', async () => {
+  const stdin = mock.stdin();
+
+  const command = new Prot2pept();
+  const run = command.run();
+
+  stdin.send(">AKA\nAAL\nT\nERAALTER\nP\nAALTER\n");
+  stdin.end();
+
+  await run;
+
+  expect(errorSpy).toHaveBeenCalledTimes(0);
+  expect(output.join("").trimEnd()).toBe(">AKA\nAALTER\nAALTERPAALTER");
+});
+
+test('test fasta input 3', async () => {
+  const stdin = mock.stdin();
+
+  const command = new Prot2pept();
+  const run = command.run();
+
+  stdin.send(">AKA\nAAL\nT\n>\nERAALTER\nP\nAALTER");
+  stdin.end();
+
+  await run;
+
+  expect(errorSpy).toHaveBeenCalledTimes(0);
+  expect(output.join("").trimEnd()).toBe(">AKA\nAALT\n>\nER\nAALTERPAALTER");
+});
+
+test('test custom pattern', async () => {
+  const stdin = mock.stdin();
+
+  const command = new Prot2pept({ args: ["--pattern", "([KR])([^A])"] });
+  const run = command.run();
+
+  stdin.send("AALTERAALTERPAALTER\n");
+  stdin.end();
+
+  await run;
+
+  expect(errorSpy).toHaveBeenCalledTimes(0);
+  expect(output.join("").trimEnd()).toBe("AALTERAALTER\nPAALTER");
+});
diff --git a/tsconfig.json b/tsconfig.json
index 5c9bef88..23047235 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -8,5 +8,6 @@
     "module": "NodeNext",
     "strict": true,
     "resolveJsonModule": true,
+    "target": "esnext"
   }
 }

From f0346d34aa9d099a5f0afde1f6fd1314fe3a1e15 Mon Sep 17 00:00:00 2001
From: Bart Mesuere
Date: Fri, 21 Jun 2024 18:57:13 +0200
Subject: [PATCH 3/3] Update lib/commands/prot2pept.ts

Co-authored-by: Pieter Verschaffelt
---
 lib/commands/prot2pept.ts | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/commands/prot2pept.ts b/lib/commands/prot2pept.ts
index 3b3a6b01..a64d862a 100644
--- a/lib/commands/prot2pept.ts
+++ b/lib/commands/prot2pept.ts
@@ -29,7 +29,6 @@ The input should have either one protein sequence per line or contain a FASTA fo
       pattern = new RegExp(this.program.opts().pattern, "g");
     } catch (e) {
       this.program.error(`Your pattern was invalid: ${(e as Error).message}`);
-      //process.exit(1);
     }
 
     let fasta = false;