Skip to content

Commit

Permalink
implement prot2pept
Browse files Browse the repository at this point in the history
  • Loading branch information
bmesuere committed Jun 21, 2024
1 parent eb8fb5d commit aa5d358
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 0 deletions.
6 changes: 6 additions & 0 deletions bin/prot2pept.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env node

import { Prot2pept } from '../lib/commands/prot2pept.js';

// Entry point of the prot2pept executable: build the command and start it immediately.
new Prot2pept().run();
73 changes: 73 additions & 0 deletions lib/commands/prot2pept.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import { createInterface } from 'node:readline';
import { BaseCommand } from './base_command.js';

/**
 * Command that reads protein sequences from standard input and writes the peptides obtained by
 * cleaving every sequence according to a (configurable) regular expression to standard output.
 *
 * Two input layouts are supported:
 * - plain: every line is one protein sequence;
 * - FASTA: detected when the very first input line starts with `>`. Header lines are echoed
 *   unchanged and the sequence lines between two headers are concatenated into one protein.
 */
export class Prot2pept extends BaseCommand {

  readonly description = `The prot2pept command splits each protein sequence into a list of peptides according to a given cleavage-pattern. The command expects a list of protein sequences that are passed to standard input.
The input should have either one protein sequence per line or contain a FASTA formatted list of protein sequences. FASTA headers are preserved in the output, so that peptides can be bundled per protein sequence.
`;

  /**
   * Registers the summary, the description and the `--pattern` option on the command's program.
   *
   * @param options forwarded untouched to the `BaseCommand` constructor.
   */
  constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean, args?: string[] }) {
    super(options);

    this.program
      .summary("Splits each protein sequence into a list of peptides.")
      .description(this.description)
      .option("-p, --pattern <regex>", "specify cleavage-pattern (regex) as the pattern after which the next peptide will be cleaved. By default, it will create tryptic peptides.", "([KR])([^P])");
  }

  /**
   * Reads standard input line by line, splits every protein into peptides and writes the result to
   * standard output, one peptide per line.
   *
   * Performance note: Just as with peptfilter, this implementation can be made faster by using line
   * events instead of async iterators.
   *
   * @returns a promise that resolves once all input has been consumed and the last buffer was
   * handed to `process.stdout`.
   */
  async run(): Promise<void> {
    this.parseArguments();
    const pattern = new RegExp(this.program.opts().pattern, "g");

    // number of input lines that are processed between two writes to stdout
    const flushInterval = 1000;

    let fasta = false;
    // sequence lines of the FASTA record that is currently being read
    let protein: string[] = [];

    // buffering output makes a big difference in performance
    let output: string[] = [];
    let i = 0;

    for await (const line of createInterface({ input: process.stdin })) {
      // the input format is decided once, based on the first line only
      if (i === 0 && line.startsWith(">")) {
        fasta = true;
      }

      i++;

      if (fasta) { // if we're in fasta mode, a protein could be split over multiple lines
        if (line.startsWith(">")) { // if we encounter a new header, process the previous protein and output the current header
          if (protein.length > 0) {
            output.push(Prot2pept.splitProtein(protein.join(""), pattern));
          }
          output.push(line.trimEnd());
          protein = [];
        } else {
          protein.push(line.trimEnd());
        }
      } else { // if we're not in fasta mode, each line is a protein sequence
        output.push(Prot2pept.splitProtein(line.trimEnd(), pattern));
      }

      if (i % flushInterval === 0) {
        output.push(""); // add a newline at the end of the buffer without additional string copy
        process.stdout.write(output.join("\n"));
        output = [];
      }
    }

    // If in fasta mode, process the last protein. The guard mirrors the one used inside the loop:
    // without it, input that ends on a header without sequence lines produces a stray empty line.
    if (fasta && protein.length > 0) {
      output.push(Prot2pept.splitProtein(protein.join(""), pattern));
    }
    output.push("");
    process.stdout.write(output.join("\n"));
  }

  /**
   * Splits a single protein sequence into peptides.
   *
   * A newline is inserted between the first and the second capture group of every match of
   * `pattern`. Because a match consumes the residue captured by the second group, that residue can
   * not start a match of its own during the same pass (e.g. the second `K` in `KKA`), so the
   * replacement is applied twice. During the second pass the second group may capture a newline
   * inserted by the first pass, which yields two consecutive newlines; these are collapsed again.
   *
   * @param line the protein sequence, without line terminator.
   * @param pattern global regular expression with two capture groups: the residue after which to
   * cleave and the residue that follows it.
   * @returns the peptides, separated by `\n` and without trailing newline.
   */
  static splitProtein(line: string, pattern: RegExp): string {
    const cleavage = "$1\n$2";
    return line
      .replaceAll(pattern, cleavage)
      .replaceAll(pattern, cleavage)
      .replaceAll("\n\n", "\n");
  }
}
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"type": "module",
"bin": {
"peptfilter": "./bin/peptfilter.js",
"prot2pept": "./bin/prot2pept.js",
"uniprot": "./bin/uniprot.js"
},
"scripts": {
Expand All @@ -17,6 +18,7 @@
"test": "NODE_OPTIONS='--experimental-vm-modules --no-warnings' yarn run jest",
"typecheck": "yarn tsc --skipLibCheck --noEmit",
"peptfilter": "yarn run tsx bin/peptfilter.ts",
"prot2pept": "yarn run tsx bin/prot2pept.ts",
"uniprot": "yarn run tsx bin/uniprot.ts"
},
"dependencies": {
Expand Down

0 comments on commit aa5d358

Please sign in to comment.