Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port prot2pept to typescript #176

Merged
merged 3 commits into from
Jun 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions bin/prot2pept.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env node

import { Prot2pept } from '../lib/commands/prot2pept.js';

// Entry point of the prot2pept executable: build the command and start it right away.
new Prot2pept().run();
79 changes: 79 additions & 0 deletions lib/commands/prot2pept.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import { createInterface } from 'node:readline';
import { BaseCommand } from './base_command.js';

export class Prot2pept extends BaseCommand {

/** Long-form help text of the command; the constructor registers it on the program via `.description(...)`. */
readonly description = `The prot2pept command splits each protein sequence into a list of peptides according to a given cleavage-pattern. The command expects a list of protein sequences that are passed to standard input.

The input should have either one protein sequence per line or contain a FASTA formatted list of protein sequences. FASTA headers are preserved in the output, so that peptides can be bundled per protein sequence.
`;

/**
 * Creates the command and registers its summary, description and `--pattern` option on the program.
 *
 * @param options forwarded unchanged to the base command constructor
 */
constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean, args?: string[] }) {
    super(options);

    // The single option of this command: its flags, its help text and the default (tryptic) cleavage pattern.
    const patternFlags = "-p, --pattern <regex>";
    const patternHelp = "specify cleavage-pattern (regex) as the pattern after which the next peptide will be cleaved. By default, it will create tryptic peptides.";
    const defaultPattern = "([KR])([^P])";

    this.program
        .summary("Splits each protein sequence into a list of peptides.")
        .description(this.description)
        .option(patternFlags, patternHelp, defaultPattern);
}

/**
 * Reads protein sequences from standard input, cleaves each of them with the configured cleavage pattern and
 * writes the result to standard output.
 *
 * If the very first input line starts with ">", the whole input is treated as FASTA: header lines are echoed
 * (with trailing whitespace removed) and all sequence lines between two headers are concatenated before being
 * cleaved. Otherwise every input line is treated as one complete protein sequence.
 *
 * An invalid `--pattern` value is reported through `this.program.error` and nothing is processed.
 *
 * Performance note: Just as with peptfilter, this implementation can be made faster by using line events instead of
 * async iterators.
 */
async run(): Promise<void> {
    this.parseArguments();

    let pattern: RegExp;
    try {
        // the global flag is required by the replaceAll calls in splitProtein
        pattern = new RegExp(this.program.opts().pattern, "g");
    } catch (e) {
        // returning here guarantees that nothing below ever runs without a valid pattern
        return this.program.error(`Your pattern was invalid: ${(e as Error).message}`);
    }

    let fasta = false;
    // sequence lines collected for the FASTA record that is currently being read
    let protein: string[] = [];

    // buffering output makes a big difference in performance
    let output: string[] = [];
    let i = 0;

    for await (const line of createInterface({ input: process.stdin })) {
        // only the first line decides whether the input is handled as FASTA
        if (i === 0 && line.startsWith(">")) {
            fasta = true;
        }

        i++;

        if (fasta) { // if we're in fasta mode, a protein could be split over multiple lines
            if (line.startsWith(">")) { // if we encounter a new header, process the previous protein and output the current header
                if (protein.length > 0) {
                    output.push(Prot2pept.splitProtein(protein.join(""), pattern));
                }
                output.push(line.trimEnd());
                protein = [];
            } else {
                protein.push(line.trimEnd());
            }
        } else { // if we're not in fasta mode, each line is a protein sequence
            output.push(Prot2pept.splitProtein(line.trimEnd(), pattern));
        }

        // flush the buffer every 1000 input lines
        if (i % 1000 === 0) {
            output.push(""); //add a newline at the end of the buffer without additional string copy
            process.stdout.write(output.join("\n"));
            output = [];
        }
    }

    // if in fasta mode, process the last protein; skip it when no sequence line followed the last header,
    // otherwise a spurious empty line would be printed (same guard as for headers inside the loop)
    if (fasta && protein.length > 0) {
        output.push(Prot2pept.splitProtein(protein.join(""), pattern));
    }
    output.push("");
    process.stdout.write(output.join("\n"));
}

/**
 * Cleaves a protein sequence into peptides.
 *
 * A newline is inserted between the first and the second capture group of every match of `pattern`. The
 * substitution is applied twice because a single pass cannot split matches that overlap each other (for
 * example "KRKRK" with the default pattern). Any doubled newline left behind is collapsed into one afterwards.
 *
 * @param line the protein sequence to cleave
 * @param pattern cleavage pattern with two capture groups; it must carry the global flag, which `replaceAll` requires
 * @returns the peptides, separated by newlines
 */
static splitProtein(line: string, pattern: RegExp): string {
    const cleave = (sequence: string): string => sequence.replaceAll(pattern, "$1\n$2");
    return cleave(cleave(line)).replaceAll("\n\n", "\n");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm probably overlooking something, but why do we perform replaceAll(pattern, "$1\n$2") twice with the same pattern?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

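It's to catch overlapping matches, like KRKRK: a match consumes both of its characters, so a cleavage site that starts on the second character of a match is only found by the second pass.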

}
}
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"type": "module",
"bin": {
"peptfilter": "./bin/peptfilter.js",
"prot2pept": "./bin/prot2pept.js",
"uniprot": "./bin/uniprot.js"
},
"scripts": {
Expand All @@ -17,6 +18,7 @@
"test": "NODE_OPTIONS='--experimental-vm-modules --no-warnings' yarn run jest",
"typecheck": "yarn tsc --skipLibCheck --noEmit",
"peptfilter": "yarn run tsx bin/peptfilter.ts",
"prot2pept": "yarn run tsx bin/prot2pept.ts",
"uniprot": "yarn run tsx bin/uniprot.ts"
},
"dependencies": {
Expand Down
124 changes: 124 additions & 0 deletions tests/commands/prot2pept.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import { Prot2pept } from '../../lib/commands/prot2pept';
import { jest } from '@jest/globals';
import * as mock from 'mock-stdin';

// Chunks written to stdout / stderr while a test is running.
let output: string[];
let error: string[];

// Replace the real stream writers so that every written chunk is recorded instead of printed.
jest.spyOn(process.stdout, "write").mockImplementation((data: unknown) => {
    output.push(data as string);
    return true;
});
const errorSpy = jest.spyOn(process.stderr, "write").mockImplementation((data: unknown) => {
    error.push(data as string);
    return true;
});

beforeEach(() => {
    output = [];
    error = [];
});

/**
 * Runs the prot2pept command against a mocked stdin.
 *
 * The command is started first; the given chunks are sent afterwards, one `send` call per chunk, and the
 * stream is then closed.
 *
 * @param chunks the pieces of input to send to stdin, in order
 * @param args optional command line arguments for the command
 * @returns everything that was written to stdout, with trailing whitespace removed
 */
async function runProt2pept(chunks: string[], args?: string[]): Promise<string> {
    const stdin = mock.stdin();

    const command = args === undefined ? new Prot2pept() : new Prot2pept({ args });
    const finished = command.run();

    for (const chunk of chunks) {
        stdin.send(chunk);
    }
    stdin.end();

    await finished;

    return output.join("").trimEnd();
}

test('test single line input 1', async () => {
    const peptides = await runProt2pept(["AALTERAALTERPAALTER\n"]);

    expect(errorSpy).toHaveBeenCalledTimes(0);
    expect(peptides).toBe("AALTER\nAALTERPAALTER");
});

test('test single line input 2', async () => {
    const peptides = await runProt2pept(["KRKPR\n"]);

    expect(errorSpy).toHaveBeenCalledTimes(0);
    expect(peptides).toBe("K\nR\nKPR");
});

test('test multi line input', async () => {
    const peptides = await runProt2pept(["AALTERAALTERPAALTER\n", "AALTERAA\n"]);

    expect(errorSpy).toHaveBeenCalledTimes(0);
    expect(peptides).toBe("AALTER\nAALTERPAALTER\nAALTER\nAA");
});

test('test fasta input 1', async () => {
    const peptides = await runProt2pept([">AKA\nAALTERAALTERPAALTER\n"]);

    expect(errorSpy).toHaveBeenCalledTimes(0);
    expect(peptides).toBe(">AKA\nAALTER\nAALTERPAALTER");
});

test('test fasta input 2', async () => {
    const peptides = await runProt2pept([">AKA\nAAL\nT\nERAALTER\nP\nAALTER\n"]);

    expect(errorSpy).toHaveBeenCalledTimes(0);
    expect(peptides).toBe(">AKA\nAALTER\nAALTERPAALTER");
});

test('test fasta input 3', async () => {
    const peptides = await runProt2pept([">AKA\nAAL\nT\n>\nERAALTER\nP\nAALTER"]);

    expect(errorSpy).toHaveBeenCalledTimes(0);
    expect(peptides).toBe(">AKA\nAALT\n>\nER\nAALTERPAALTER");
});

test('test custom pattern', async () => {
    const peptides = await runProt2pept(["AALTERAALTERPAALTER\n"], ["--pattern", "([KR])([^A])"]);

    expect(errorSpy).toHaveBeenCalledTimes(0);
    expect(peptides).toBe("AALTERAALTER\nPAALTER");
});
1 change: 1 addition & 0 deletions tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@
"module": "NodeNext",
"strict": true,
"resolveJsonModule": true,
"target": "esnext"
}
}