From 72f970ab4ac701ae24983788c1115f2e8105f843 Mon Sep 17 00:00:00 2001 From: Bart Mesuere Date: Wed, 19 Jun 2024 13:10:01 +0200 Subject: [PATCH 1/6] implement peptfilter --- bin/peptfilter.ts | 6 +++++ lib/commands/peptfilter.ts | 53 ++++++++++++++++++++++++++++++++++++++ package.json | 2 ++ 3 files changed, 61 insertions(+) create mode 100755 bin/peptfilter.ts create mode 100644 lib/commands/peptfilter.ts diff --git a/bin/peptfilter.ts b/bin/peptfilter.ts new file mode 100755 index 00000000..007dfdba --- /dev/null +++ b/bin/peptfilter.ts @@ -0,0 +1,6 @@ +#!/usr/bin/env node + +import { Peptfilter } from '../lib/commands/peptfilter.js'; + +const command = new Peptfilter(); +command.run(); diff --git a/lib/commands/peptfilter.ts b/lib/commands/peptfilter.ts new file mode 100644 index 00000000..997801f2 --- /dev/null +++ b/lib/commands/peptfilter.ts @@ -0,0 +1,53 @@ +import { Option } from 'commander'; +import { createInterface } from 'node:readline'; +import { BaseCommand } from './base_command.js'; + +export class Peptfilter extends BaseCommand { + + readonly description = `The peptfilter command filters a list of peptides according to specific criteria. The command expects a list of peptides that are passed to standard input. + +The input should have one peptide per line. FASTA headers are preserved in the output, so that peptides remain bundled.`; + + constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean, args?: string[] }) { + super(options); + + this.program + .summary("Filter peptides based on specific criteria.") + .description(this.description) + .option("--minlen ", "only retain peptides having at least this many amino acids", (d) => parseInt(d, 10), 5) + .option("--maxlen ", "only retain peptides having at most this many amino acids", (d) => parseInt(d, 10), 50) + .option("-l, --lacks ", "only retain peptides that lack all of the specified amino acids", (d) => d.split("")) + .option("-c, --contains ", "only retain peptides that contain all of the specified amino acids", (d) => d.split("")); + } + + async run() { + this.parseArguments(); + console.log(this.program.opts()) + const minLen = this.program.opts().minLen; + const maxlen = this.program.opts().maxLen; + const lacks = this.program.opts().lacks || []; + const contains = this.program.opts().contains || []; + + for await (const line of createInterface({ input: process.stdin })) { + if (line.startsWith(">")) { + process.stdout.write(line + "\n"); + continue; + } + if (Peptfilter.checkLength(line, minLen, maxlen) && Peptfilter.checkLacks(line, lacks) && Peptfilter.checkContains(line, contains)) { + process.stdout.write(line + "\n"); + } + } + } + + static checkLength(line: string, minLen: number, maxlen: number): boolean { + return line.length >= minLen && line.length <= maxlen; + } + + static checkLacks(line: string, lacks: string[]): boolean { + return lacks.every((aa: string) => !line.includes(aa)); + } + + static checkContains(line: string, contains: string[]): boolean { + return contains.every((aa: string) => line.includes(aa)); + } +} diff --git a/package.json b/package.json index 71171b87..7daaa389 100644 --- a/package.json +++ b/package.json @@ -8,6 +8,7 @@ "private": false, "type": "module", "bin": { + "peptfilter": "./bin/peptfilter.js", "uniprot": "./bin/uniprot.js" }, "scripts": { @@ -15,6 +16,7 @@ "lint": "yarn run eslint", "test": "yarn run jest", "typecheck": "yarn tsc --skipLibCheck --noEmit", + "peptfilter": "yarn run tsx bin/peptfilter.ts", "uniprot": "yarn run tsx bin/uniprot.ts" }, "dependencies": { From f70a95c21176543fcf5c2f84a71009ce89895b83 Mon Sep 17 00:00:00 2001 From: Bart Mesuere Date: Wed, 19 Jun 2024 14:01:09 +0200 Subject: [PATCH 2/6] write tests --- lib/commands/peptfilter.ts | 6 +- tests/commands/peptfilter.test.ts | 98 +++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 3 deletions(-) create mode 100644 tests/commands/peptfilter.test.ts diff --git a/lib/commands/peptfilter.ts b/lib/commands/peptfilter.ts index 997801f2..48bba41f 100644 --- a/lib/commands/peptfilter.ts +++ b/lib/commands/peptfilter.ts @@ -22,9 +22,8 @@ The input should have one peptide per line. FASTA headers are preserved in the o async run() { this.parseArguments(); - console.log(this.program.opts()) - const minLen = this.program.opts().minLen; - const maxlen = this.program.opts().maxLen; + const minLen = this.program.opts().minlen; + const maxlen = this.program.opts().maxlen; const lacks = this.program.opts().lacks || []; const contains = this.program.opts().contains || []; @@ -33,6 +32,7 @@ The input should have one peptide per line. FASTA headers are preserved in the o process.stdout.write(line + "\n"); continue; } + if (Peptfilter.checkLength(line, minLen, maxlen) && Peptfilter.checkLacks(line, lacks) && Peptfilter.checkContains(line, contains)) { process.stdout.write(line + "\n"); } diff --git a/tests/commands/peptfilter.test.ts b/tests/commands/peptfilter.test.ts new file mode 100644 index 00000000..07bb7f20 --- /dev/null +++ b/tests/commands/peptfilter.test.ts @@ -0,0 +1,98 @@ +import { Peptfilter } from '../../lib/commands/peptfilter'; +import * as mock from 'mock-stdin'; + +let output: string[]; +let error: string[]; +const writeSpy = jest + .spyOn(process.stdout, "write") + .mockImplementation((data: unknown) => { output.push(data as string); return true; }); +const errorSpy = jest + .spyOn(process.stderr, "write") + .mockImplementation((data: unknown) => { error.push(data as string); return true; }); + +beforeEach(() => { + output = []; + error = []; +}); + +test('test length filter', async () => { + // min length + expect(Peptfilter.checkLength('AALER', 4, 10)).toBe(true); + expect(Peptfilter.checkLength('AALER', 5, 10)).toBe(true); + expect(Peptfilter.checkLength('AALER', 6, 10)).toBe(false); + + // max length + expect(Peptfilter.checkLength('AALER', 1, 4)).toBe(false); + expect(Peptfilter.checkLength('AALER', 1, 5)).toBe(true); + expect(Peptfilter.checkLength('AALER', 1, 6)).toBe(true); +}); + +test('test lacks filter', async () => { + expect(Peptfilter.checkLacks('AALER', ''.split(""))).toBe(true); + expect(Peptfilter.checkLacks('AALER', 'BCD'.split(""))).toBe(true); + expect(Peptfilter.checkLacks('AALER', 'A'.split(""))).toBe(false); + expect(Peptfilter.checkLacks('AALER', 'AE'.split(""))).toBe(false); +}); + +test('test contains filter', async () => { + expect(Peptfilter.checkContains('AALER', ''.split(""))).toBe(true); + expect(Peptfilter.checkContains('AALER', 'A'.split(""))).toBe(true); + expect(Peptfilter.checkContains('AALER', 'AE'.split(""))).toBe(true); + expect(Peptfilter.checkContains('AALER', 'BCD'.split(""))).toBe(false); + expect(Peptfilter.checkContains('AALER', 'AB'.split(""))).toBe(false); +}); + +test('test default filter from stdin', async () => { + const stdin = mock.stdin(); + + const command = new Peptfilter(); + const run = command.run(); + + stdin.send("AAAA\n"); + stdin.send("AAAAA\n"); + stdin.end(); + + await run; + + expect(writeSpy).toHaveBeenCalledTimes(1); + expect(errorSpy).toHaveBeenCalledTimes(0); + expect(output.length).toBe(1); +}); + +test('test if it passes fasta from stdin', async () => { + const stdin = mock.stdin(); + + const command = new Peptfilter(); + const run = command.run(); + + stdin.send(">AA\n"); + stdin.send("AAA\n"); + stdin.end(); + + await run; + + expect(writeSpy).toHaveBeenCalledTimes(1); + expect(errorSpy).toHaveBeenCalledTimes(0); + expect(output[0]).toBe(">AA\n"); +}); + +test('test complex example from stdin', async () => { + const stdin = mock.stdin(); + + const command = new Peptfilter({ args: ["--minlen", "4", "--maxlen", "10", "--lacks", "B", "--contains", "A"] }); + const run = command.run(); + + stdin.send("A\n"); + stdin.send("AAAAAAAAAAA\n"); + stdin.send("AAAAB\n"); + stdin.send("BBBBB\n"); + stdin.send("CCCCC\n"); + stdin.send("CCCCCA\n"); + stdin.end(); + + await run; + + expect(writeSpy).toHaveBeenCalledTimes(1); + expect(errorSpy).toHaveBeenCalledTimes(0); + expect(output[0]).toBe("CCCCCA\n"); +}); From a62ff53c3aa176a93bed63fee44bf0bd22d09c0e Mon Sep 17 00:00:00 2001 From: Bart Mesuere Date: Thu, 20 Jun 2024 09:58:41 +0200 Subject: [PATCH 3/6] optimize performance --- lib/commands/base_command.ts | 7 ++++--- lib/commands/peptfilter.ts | 25 +++++++++++++++++++------ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/lib/commands/base_command.ts b/lib/commands/base_command.ts index 90897abe..35c7b795 100644 --- a/lib/commands/base_command.ts +++ b/lib/commands/base_command.ts @@ -1,5 +1,5 @@ import { Command } from "commander"; -import { version } from '../../package.json'; +import { readFileSync } from "fs"; /** * This is a base class which provides a common interface for all commands. @@ -11,8 +11,10 @@ import { version } from '../../package.json'; export abstract class BaseCommand { public program: Command; args: string[] | undefined; + version: string; constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean, args?: string[] }) { + this.version = JSON.parse(readFileSync(new URL("../../package.json", import.meta.url), "utf8")).version; this.program = this.create(options); this.args = options?.args; } @@ -37,8 +39,7 @@ export abstract class BaseCommand { writeErr: () => { } }); } - - program.version(version); + program.version(this.version); return program; } diff --git a/lib/commands/peptfilter.ts b/lib/commands/peptfilter.ts index 48bba41f..87f28ff5 100644 --- a/lib/commands/peptfilter.ts +++ b/lib/commands/peptfilter.ts @@ -1,4 +1,3 @@ -import { Option } from 'commander'; import { createInterface } from 'node:readline'; import { BaseCommand } from './base_command.js'; @@ -20,6 +19,11 @@ The input should have one peptide per line. FASTA headers are preserved in the o .option("-c, --contains ", "only retain peptides that contain all of the specified amino acids", (d) => d.split("")); } + /** + * Performance note: this implementation takes 4 seconds to run on swissprot. It can be made faster by using line events instead of + * async iterators. This alternative implementation runs in 2.5 seconds. However, I decided that the async iterator implementation is + * both more readable and more in line with the implementation of the other commands. + */ async run() { this.parseArguments(); const minLen = this.program.opts().minlen; @@ -27,16 +31,25 @@ The input should have one peptide per line. FASTA headers are preserved in the o const lacks = this.program.opts().lacks || []; const contains = this.program.opts().contains || []; + let output = []; + let i = 0; + for await (const line of createInterface({ input: process.stdin })) { + i++; if (line.startsWith(">")) { - process.stdout.write(line + "\n"); - continue; + output.push(line); + } else if (Peptfilter.checkLength(line, minLen, maxlen) && Peptfilter.checkLacks(line, lacks) && Peptfilter.checkContains(line, contains)) { + output.push(line); } - - if (Peptfilter.checkLength(line, minLen, maxlen) && Peptfilter.checkLacks(line, lacks) && Peptfilter.checkContains(line, contains)) { - process.stdout.write(line + "\n"); + if (i % 1000 === 0) { + output.push(""); + process.stdout.write(output.join("\n")); + output = []; } } + + output.push(""); + process.stdout.write(output.join("\n")); } static checkLength(line: string, minLen: number, maxlen: number): boolean { From 72448de4008f09dda6bee6c3debee5c8c0b64eec Mon Sep 17 00:00:00 2001 From: Bart Mesuere Date: Thu, 20 Jun 2024 10:21:03 +0200 Subject: [PATCH 4/6] fix tests --- jest.config.ts | 2 +- package.json | 2 +- tests/commands/peptfilter.test.ts | 8 ++++---- tests/commands/uniprot.test.ts | 1 + 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/jest.config.ts b/jest.config.ts index 9627c0f6..9b51f878 100644 --- a/jest.config.ts +++ b/jest.config.ts @@ -104,7 +104,7 @@ const config: Config = { // notifyMode: "failure-change", // A preset that is used as a base for Jest's configuration - // preset: undefined, + preset: 'ts-jest/presets/default-esm', // Run tests from one or more projects // projects: undefined, diff --git a/package.json b/package.json index 7daaa389..ddaa71bc 100644 --- a/package.json +++ b/package.json @@ -14,7 +14,7 @@ "scripts": { "build": "yarn run tsc", "lint": "yarn run eslint", - "test": "yarn run jest", + "test": "NODE_OPTIONS='--experimental-vm-modules --no-warnings' yarn run jest", "typecheck": "yarn tsc --skipLibCheck --noEmit", "peptfilter": "yarn run tsx bin/peptfilter.ts", "uniprot": "yarn run tsx bin/uniprot.ts" diff --git a/tests/commands/peptfilter.test.ts b/tests/commands/peptfilter.test.ts index 07bb7f20..ad1cfff1 100644 --- a/tests/commands/peptfilter.test.ts +++ b/tests/commands/peptfilter.test.ts @@ -1,4 +1,5 @@ import { Peptfilter } from '../../lib/commands/peptfilter'; +import { jest } from '@jest/globals'; import * as mock from 'mock-stdin'; let output: string[]; @@ -54,9 +55,8 @@ test('test default filter from stdin', async () => { await run; - expect(writeSpy).toHaveBeenCalledTimes(1); expect(errorSpy).toHaveBeenCalledTimes(0); - expect(output.length).toBe(1); + expect(output.join("").trimEnd().split("\n").length).toBe(1); }); test('test if it passes fasta from stdin', async () => { @@ -71,8 +71,8 @@ test('test if it passes fasta from stdin', async () => { await run; - expect(writeSpy).toHaveBeenCalledTimes(1); expect(errorSpy).toHaveBeenCalledTimes(0); + expect(output.join("").trimEnd().split("\n").length).toBe(1); expect(output[0]).toBe(">AA\n"); }); @@ -92,7 +92,7 @@ test('test complex example from stdin', async () => { await run; - expect(writeSpy).toHaveBeenCalledTimes(1); expect(errorSpy).toHaveBeenCalledTimes(0); + expect(output.join("").trimEnd().split("\n").length).toBe(1); expect(output[0]).toBe("CCCCCA\n"); }); diff --git a/tests/commands/uniprot.test.ts b/tests/commands/uniprot.test.ts index ca72837b..59ba4f7f 100644 --- a/tests/commands/uniprot.test.ts +++ b/tests/commands/uniprot.test.ts @@ -1,4 +1,5 @@ import { Uniprot } from '../../lib/commands/uniprot'; +import { jest } from '@jest/globals'; import * as mock from 'mock-stdin'; let output: string[]; From f5b879ff5739a024421abe800c27bf04cdfbea07 Mon Sep 17 00:00:00 2001 From: Bart Mesuere Date: Thu, 20 Jun 2024 10:23:51 +0200 Subject: [PATCH 5/6] lint --- tests/commands/peptfilter.test.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/commands/peptfilter.test.ts b/tests/commands/peptfilter.test.ts index ad1cfff1..0318fde2 100644 --- a/tests/commands/peptfilter.test.ts +++ b/tests/commands/peptfilter.test.ts @@ -4,6 +4,7 @@ import * as mock from 'mock-stdin'; let output: string[]; let error: string[]; +// eslint-disable-next-line @typescript-eslint/no-unused-vars const writeSpy = jest .spyOn(process.stdout, "write") .mockImplementation((data: unknown) => { output.push(data as string); return true; }); From eb8fb5d48be135937e010541de0db4d331e845c6 Mon Sep 17 00:00:00 2001 From: Bart Mesuere Date: Thu, 20 Jun 2024 13:01:13 +0200 Subject: [PATCH 6/6] add comments --- lib/commands/peptfilter.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/commands/peptfilter.ts b/lib/commands/peptfilter.ts index 87f28ff5..f8435dc4 100644 --- a/lib/commands/peptfilter.ts +++ b/lib/commands/peptfilter.ts @@ -31,18 +31,19 @@ The input should have one peptide per line. FASTA headers are preserved in the o const lacks = this.program.opts().lacks || []; const contains = this.program.opts().contains || []; + // buffering output makes a big difference in performance let output = []; let i = 0; for await (const line of createInterface({ input: process.stdin })) { i++; - if (line.startsWith(">")) { + if (line.startsWith(">")) { // pass through FASTA headers output.push(line); } else if (Peptfilter.checkLength(line, minLen, maxlen) && Peptfilter.checkLacks(line, lacks) && Peptfilter.checkContains(line, contains)) { output.push(line); } if (i % 1000 === 0) { - output.push(""); + output.push(""); //add a newline at the end of the buffer without additional string copy process.stdout.write(output.join("\n")); output = []; }