Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pept2lca #177

Merged
merged 16 commits into from
Aug 1, 2024
6 changes: 6 additions & 0 deletions bin/unipept.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env node

import { Unipept } from '../lib/commands/unipept.js';

// Build the top-level `unipept` program and let it handle the process arguments.
new Unipept().run();
12 changes: 5 additions & 7 deletions lib/commands/base_command.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,14 @@ import { readFileSync } from "fs";
*/
export abstract class BaseCommand {
public program: Command;
args: string[] | undefined;
version: string;

constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean, args?: string[] }) {
constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean }) {
this.version = JSON.parse(readFileSync(new URL("../../package.json", import.meta.url), "utf8")).version;
this.program = this.create(options);
this.args = options?.args;
}

abstract run(): void;
abstract run(args?: string[]): void;

/**
* Create sets up the command line program. Implementing classes can add additional options.
Expand Down Expand Up @@ -47,10 +45,10 @@ export abstract class BaseCommand {
/**
* This allows us to pass a custom list of strings as arguments to the command during testing.
*/
parseArguments() {
if (this.args) {
parseArguments(args?: string[]) {
if (args) {
// custom arg parsing to be able to inject args for testing
this.program.parse(this.args, { from: "user" });
this.program.parse(args, { from: "user" });
} else {
this.program.parse();
}
Expand Down
6 changes: 3 additions & 3 deletions lib/commands/peptfilter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ export class Peptfilter extends BaseCommand {

The input should have one peptide per line. FASTA headers are preserved in the output, so that peptides remain bundled.`;

constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean, args?: string[] }) {
constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean }) {
super(options);

this.program
Expand All @@ -24,8 +24,8 @@ The input should have one peptide per line. FASTA headers are preserved in the o
* async iterators. This alternative implementation runs in 2.5 seconds. However, I decided that the async iterator implementation is
* both more readable and more in line with the implementation of the other commands.
*/
async run() {
this.parseArguments();
async run(args?: string[]) {
this.parseArguments(args);
const minLen = this.program.opts().minlen;
const maxlen = this.program.opts().maxlen;
const lacks = this.program.opts().lacks || [];
Expand Down
6 changes: 3 additions & 3 deletions lib/commands/prot2pept.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ export class Prot2pept extends BaseCommand {
The input should have either one protein sequence per line or contain a FASTA formatted list of protein sequences. FASTA headers are preserved in the output, so that peptides can be bundled per protein sequence.
`;

constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean, args?: string[] }) {
constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean }) {
super(options);

this.program
Expand All @@ -21,8 +21,8 @@ The input should have either one protein sequence per line or contain a FASTA fo
* Performance note: Just as with peptfilter, this implementation can be made faster by using line events instead of
* async iterators.
*/
async run() {
this.parseArguments();
async run(args?: string[]) {
this.parseArguments(args);

let pattern;
try {
Expand Down
28 changes: 28 additions & 0 deletions lib/commands/unipept.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import { BaseCommand } from './base_command.js';
import { Pept2lca } from './unipept/pept2lca.js';

/**
 * Top-level `unipept` command.
 *
 * It carries no behaviour of its own: it sets the summary and description of the
 * program and registers the `pept2lca` subcommand, which does the actual work.
 */
export class Unipept extends BaseCommand {

  readonly description = `The unipept subcommands are command line wrappers around the Unipept web services.

Subcommands that start with pept expect a list of tryptic peptides as input. Subcommands that start with tax expect a list of NCBI Taxonomy Identifiers as input. Input is passed

- as separate command line arguments
- in a text file that is passed as an argument to the -i option
- to standard input

The command will give priority to the first way the input is passed, in the order as listed above. Text files and standard input should have one tryptic peptide or one NCBI Taxonomy Identifier per line.`;

  /**
   * @param options forwarded unchanged to the `BaseCommand` constructor.
   */
  constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean }) {
    super(options);

    this.program
      .summary("Command line interface to Unipept web services.")
      .description(this.description)
      .addCommand(new Pept2lca().command);
  }

  /**
   * Parses the arguments and runs the selected subcommand.
   *
   * The subcommand action handlers are asynchronous, so the arguments are parsed with
   * `parseAsync`: the returned promise only settles once the handler has finished, and a
   * failure inside the handler rejects it instead of being lost.
   *
   * @param args optional list of arguments to parse instead of the process arguments
   *             (used to inject arguments during testing).
   */
  async run(args?: string[]): Promise<void> {
    if (args) {
      // custom arg parsing to be able to inject args for testing
      await this.program.parseAsync(args, { from: "user" });
    } else {
      await this.program.parseAsync();
    }
  }
}
34 changes: 34 additions & 0 deletions lib/commands/unipept/pept2lca.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import { Option } from "commander";
import { UnipeptSubcommand } from "./unipept_subcommand.js";

/**
 * The `pept2lca` subcommand of `unipept`.
 *
 * Only the command line surface (summary, description, options and the variadic
 * `peptides` argument) is declared here; running the command is delegated to
 * `UnipeptSubcommand.run`.
 */
export class Pept2lca extends UnipeptSubcommand {

  readonly description = `For each tryptic peptide the unipept pept2lca command retrieves from Unipept the lowest common ancestor of the set of taxa from all UniProt entries whose protein sequence contains an exact matches to the tryptic peptide. The lowest common ancestor is based on the topology of the Unipept Taxonomy -- a cleaned up version of the NCBI Taxonomy -- and is itself a record from the NCBI Taxonomy. The command expects a list of tryptic peptides that are passed

- as separate command line arguments
- in a text file that is passed as an argument to the -i option
- to standard input

The command will give priority to the first way tryptic peptides are passed, in the order as listed above. Text files and standard input should have one tryptic peptide per line.`;

  constructor() {
    super("pept2lca");

    const selectOption = new Option(
      "-s --select <fields...>",
      "select the information fields to return. Selected fields are passed as a comma separated list of field names. Multiple -s (or --select) options may be used."
    );

    const cmd = this.command;
    cmd.summary("Fetch taxonomic lowest common ancestor of UniProt entries that match tryptic peptides.");
    cmd.description(this.description);
    cmd.option("-e, --equate", "equate isoleucine (I) and leucine (L) when matching peptides");
    cmd.option("-a, --all", "report all information fields of NCBI Taxonomy records available in Unipept. Note that this may have a performance penalty.");
    cmd.addOption(selectOption);
    cmd.argument("[peptides...]", "optionally, 1 or more peptides");
    // Hand the parsed peptides and options over to the shared run implementation.
    cmd.action((peptides, opts) => this.run(peptides, opts));
  }

  /** Field names this subcommand reports as required: only `peptide`. */
  requiredFields(): string[] {
    return ["peptide"];
  }

  /** Number of peptides per request when no `--batch` option is given. */
  defaultBatchSize(): number {
    return 100;
  }
}
154 changes: 154 additions & 0 deletions lib/commands/unipept/unipept_subcommand.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import { Command, Option } from "commander";
import { createReadStream, createWriteStream, readFileSync } from "fs";
import { createInterface } from "node:readline";
import { Interface } from "readline";
import { Formatter } from "../../formatters/formatter.js";
import { FormatterFactory } from "../../formatters/formatter_factory.js";

/**
 * Base class for the `unipept` subcommands that wrap a Unipept web service endpoint.
 *
 * It creates the commander `Command` with the options shared by every subcommand, reads
 * the input (command line arguments, a file or standard input), posts it to the API in
 * batches and writes the formatted responses to the output stream.
 */
export abstract class UnipeptSubcommand {
  public command: Command;
  static readonly VALID_FORMATS = ["blast", "csv", "json", "xml"];
  // Parsed commander options of the current run; filled in by `run`.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  options: any = {};
  // Subcommand name; also used as the endpoint name in the request URL.
  name: string;
  user_agent: string;
  host = "https://api.unipept.ugent.be";
  url?: string;
  formatter?: Formatter;
  outputStream: NodeJS.WritableStream = process.stdout;
  // True until the header has been written.
  firstBatch = true;
  // Cache for `getSelectedFields`.
  selectedFields?: RegExp[];
  // Passed to the formatter as its `fastaMapper` argument; always false in this class.
  fasta: boolean;

  /**
   * @param name name of the subcommand, e.g. `pept2lca`.
   */
  constructor(name: string) {
    this.name = name;
    const version = JSON.parse(readFileSync(new URL("../../../package.json", import.meta.url), "utf8")).version;
    this.user_agent = `unipept-cli/${version}`;
    this.command = this.create(name);
    this.fasta = false;
  }

  /** Number of input items per request when no `--batch` option is given. */
  abstract defaultBatchSize(): number;

  /**
   * Fields that are appended to the user's selection when `fasta` is set and at least one
   * field was selected. Empty by default.
   */
  requiredFields(): string[] {
    return [];
  }

  /**
   * Creates the commander command with the options shared by all subcommands.
   * Subclasses add their own options to the returned command.
   */
  create(name: string): Command {
    const command = new Command(name);

    command.option("-q, --quiet", "disable service messages");
    command.option("-i, --input <file>", "read input from file");
    command.option("-o, --output <file>", "write output to file");
    command.addOption(new Option("-f, --format <format>", "define the output format").choices(UnipeptSubcommand.VALID_FORMATS).default("json"));
    command.option("--host <host>", "specify the server running the Unipept web service");

    // internal options
    command.addOption(new Option("--no-header", "disable the header in csv output").hideHelp());
    command.addOption(new Option("--batch <size>", "specify the batch size").hideHelp());

    return command;
  }

  /**
   * Runs the subcommand: resolves host, URL, formatter and output stream from the options,
   * then sends the input to the API in batches of `batchSize` items.
   *
   * @param args input items given as command line arguments (may be empty).
   * @param options parsed commander options.
   */
  async run(args: string[], options: { input?: string }): Promise<void> {
    this.options = options;
    this.host = this.getHost();
    this.url = `${this.host}/api/v2/${this.name}.json`;
    this.formatter = FormatterFactory.getFormatter(this.options.format);
    if (this.options.output) {
      this.outputStream = createWriteStream(this.options.output);
    }

    let slice: string[] = [];

    for await (const input of this.getInputIterator(args, options.input)) {
      slice.push(input);
      if (slice.length >= this.batchSize) {
        await this.processBatch(slice);
        slice = [];
      }
    }

    // Flush the remainder. An empty remainder (no input at all, or an input size that is an
    // exact multiple of the batch size) is skipped so no request without input is sent.
    if (slice.length > 0) {
      await this.processBatch(slice);
    }
  }

  /**
   * Posts one batch to the API and writes the formatted response to the output stream.
   * The header is written once, before the first batch that returns data, unless the
   * `header` option is disabled.
   *
   * @param slice the input items of this batch.
   * @throws Error when the formatter or URL is not set, or when the server answers with a
   *         non-success HTTP status.
   */
  async processBatch(slice: string[]): Promise<void> {
    if (!this.formatter) throw new Error("Formatter not set");
    if (!this.url) throw new Error("URL not set");

    const r = await fetch(this.url, {
      method: "POST",
      body: this.constructRequestBody(slice),
      headers: {
        "Accept-Encoding": "gzip",
        "User-Agent": this.user_agent,
      }
    });
    // Fail loudly instead of trying to format an error response as if it were data.
    if (!r.ok) {
      throw new Error(`Request to ${this.url} failed: ${r.status} ${r.statusText}`);
    }
    const result = await r.json();

    // The formatter builds the header from the response it is given, so an empty response
    // cannot provide one: keep `firstBatch` set until a batch actually returns data.
    const emptyResult = Array.isArray(result) && result.length === 0;
    if (this.firstBatch && this.options.header && !emptyResult) {
      this.firstBatch = false;
      this.outputStream.write(this.formatter.header(result, this.fasta));
    }

    this.outputStream.write(this.formatter.format(result, this.fasta));
  }

  /**
   * Builds the form-encoded request body for one batch.
   *
   * `names` is only requested when `--all` is set and either no fields were selected or a
   * selected pattern mentions `name` or ends in a wildcard.
   *
   * NOTE(review): URLSearchParams stringifies its values, so options that were not given
   * are sent as the text "undefined" -- confirm the API treats that as false.
   */
  private constructRequestBody(slice: string[]): URLSearchParams {
    const selected = this.getSelectedFields();
    const names = selected.length === 0 || selected.some(regex => regex.toString().includes("name") || regex.toString().includes(".*$"));
    return new URLSearchParams({
      input: JSON.stringify(slice),
      equate_il: this.options.equate,
      extra: this.options.all,
      names: this.options.all && names
    });
  }

  /**
   * Returns the selected fields as regular expressions. The `select` option values are
   * split on commas; the result is computed once and cached in `selectedFields`.
   */
  private getSelectedFields(): RegExp[] {
    if (this.selectedFields) return this.selectedFields;

    const fields = (this.options.select as string[])?.flatMap(f => f.split(",")) ?? [];
    if (this.fasta && fields.length > 0) {
      fields.push(...this.requiredFields());
    }
    this.selectedFields = fields.map(f => this.globToRegex(f));

    return this.selectedFields;
  }

  /** Batch size from the `batch` option when given, the subcommand default otherwise. */
  private get batchSize(): number {
    if (this.options.batch) {
      return +this.options.batch;
    } else {
      return this.defaultBatchSize();
    }
  }

  /**
   * Returns an input iterator to use for the request.
   * - if arguments are given, use arguments
   * - if an input file is given, use the file
   * - otherwise, use standard input
   */
  private getInputIterator(args: string[], input?: string): string[] | Interface {
    if (args.length > 0) {
      return args;
    } else if (input) {
      return createInterface({ input: createReadStream(input) });
    } else {
      return createInterface({ input: process.stdin });
    }
  }

  /** Returns the `host` option (or the default host), prefixed with `http://` if it has no scheme. */
  private getHost(): string {
    const host = this.options.host || this.host;

    // add http:// if needed
    if (host.startsWith("http://") || host.startsWith("https://")) {
      return host;
    } else {
      return `http://${host}`;
    }
  }

  /** Converts a glob to an anchored regular expression in which every `*` matches any text. */
  private globToRegex(glob: string): RegExp {
    return new RegExp(`^${glob.replace(/\*/g, ".*")}$`);
  }
}
6 changes: 3 additions & 3 deletions lib/commands/uniprot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ The command will give priority to the first way UniProt Accession Numbers are pa

The uniprot command yields just the protein sequences as a default, but can return several formats.`;

constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean, args?: string[] }) {
constructor(options?: { exitOverride?: boolean, suppressOutput?: boolean }) {
super(options);

this.program
Expand All @@ -26,8 +26,8 @@ The uniprot command yields just the protein sequences as a default, but can retu
.addOption(new Option("-f, --format <format>", `output format`).choices(Uniprot.VALID_FORMATS).default("sequence"));
}

async run() {
this.parseArguments();
async run(args?: string[]) {
this.parseArguments(args);
const format = this.program.opts().format;
const accessions = this.program.args;

Expand Down
21 changes: 21 additions & 0 deletions lib/formatters/csv_formatter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import { Formatter } from "./formatter.js";
import { stringify } from "csv-stringify/sync";

/**
 * Formatter that renders records as CSV using `csv-stringify`.
 */
export class CSVFormatter extends Formatter {

  /**
   * Returns the CSV header line, built from the keys of the first record of `sampleData`.
   * Without any record there is nothing to derive a header from, so an empty string is
   * returned.
   */
  header(sampleData: { [key: string]: string }[], fastaMapper?: boolean | undefined): string {
    if (sampleData.length === 0) {
      return "";
    }
    return stringify([this.getKeys(sampleData, fastaMapper)]);
  }

  /** CSV output has no footer. */
  footer(): string {
    return "";
  }

  /** Renders the records as CSV rows. */
  convert(data: object[]): string {
    return stringify(data);
  }

  /**
   * Returns the column names: the keys of the first record, preceded by `fasta_header`
   * when `fastaMapper` is set. An empty `data` array contributes no keys instead of
   * failing on the missing first record.
   */
  getKeys(data: { [key: string]: string }[], fastaMapper?: boolean | undefined): string[] {
    const keys = data.length > 0 ? Object.keys(data[0]) : [];
    return fastaMapper ? ["fasta_header", ...keys] : keys;
  }
}
18 changes: 18 additions & 0 deletions lib/formatters/formatter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/**
 * Base class of the output formatters.
 *
 * Concrete formatters provide `header`, `footer` and `convert`; callers go through
 * `format`, which optionally integrates FASTA headers before converting.
 */
export abstract class Formatter {

  /** Text to emit before the data; `sampleData` is available to derive it from. */
  abstract header(sampleData: object, fastaMapper?: boolean): string;

  /** Text to emit after the data. */
  abstract footer(): string;

  /** Converts the records to their textual representation. */
  abstract convert(data: object[], first?: boolean): string;

  /**
   * Formats the records: when `fastaMapper` is truthy the records first pass through
   * `integrateFastaHeaders`, then the outcome is handed to `convert` together with `first`.
   */
  format(data: object[], fastaMapper?: boolean, first?: boolean): string {
    const rows = fastaMapper ? this.integrateFastaHeaders(data, fastaMapper) : data;
    return this.convert(rows, first);
  }

  /** Default implementation: returns the records untouched. */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  integrateFastaHeaders(data: object[], fastaMapper: boolean): object[] {
    return data;
  }
}
11 changes: 11 additions & 0 deletions lib/formatters/formatter_factory.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import { CSVFormatter } from "./csv_formatter.js";
import { Formatter } from "./formatter.js";

/**
 * Maps an output format name to a formatter instance.
 */
export class FormatterFactory {
  /**
   * Returns the formatter for the given format name. Every name, including `csv`,
   * currently yields a `CSVFormatter`.
   */
  static getFormatter(name: string): Formatter {
    switch (name) {
      case "csv":
      default:
        return new CSVFormatter();
    }
  }
}
5 changes: 4 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"bin": {
"peptfilter": "./bin/peptfilter.js",
"prot2pept": "./bin/prot2pept.js",
"unipept": "./bin/unipept.js",
"uniprot": "./bin/uniprot.js"
},
"scripts": {
Expand All @@ -19,10 +20,12 @@
"typecheck": "yarn tsc --skipLibCheck --noEmit",
"peptfilter": "yarn run tsx bin/peptfilter.ts",
"prot2pept": "yarn run tsx bin/prot2pept.ts",
"unipept": "yarn run tsx bin/unipept.ts",
"uniprot": "yarn run tsx bin/uniprot.ts"
},
"dependencies": {
"commander": "^12.1.0"
"commander": "^12.1.0",
"csv-stringify": "^6.5.0"
},
"devDependencies": {
"@eslint/js": "^9.5.0",
Expand Down
Loading