Removed custom_encoder option #250

Closed
wants to merge 13 commits
28 changes: 26 additions & 2 deletions CHANGELOG.md
@@ -6,6 +6,28 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

## [Unreleased]

## [3.5.0] - 2023-08-16

### Fixed

- Don't try to assign non-existing output writer during eval mode.
- Specifying custom residues to retrain Casanovo is now possible.

## [3.4.0] - 2023-06-19

### Added

- `every_n_train_steps` parameter now controls the frequency of both validation epochs and model checkpointing during training.

### Changed

- We now log steps rather than epochs as units of progress during training.
- Validation performance metrics are logged (and added to tensorboard) during the validation epoch, and the training loss is logged at the end of each training epoch, i.e. training and validation metrics are logged asynchronously.

### Fixed

- Correctly refer to input peak files by their full file path.

## [3.3.0] - 2023-04-04

### Added
@@ -159,8 +181,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

- Initial Casanovo version.

[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v3.3.0...HEAD
[3.2.0]: https://github.com/Noble-Lab/casanovo/compare/v3.2.0...v3.3.0
[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v3.5.0...HEAD
[3.5.0]: https://github.com/Noble-Lab/casanovo/compare/v3.4.0...v3.5.0
[3.4.0]: https://github.com/Noble-Lab/casanovo/compare/v3.3.0...v3.4.0
[3.3.0]: https://github.com/Noble-Lab/casanovo/compare/v3.2.0...v3.3.0
[3.2.0]: https://github.com/Noble-Lab/casanovo/compare/v3.1.0...v3.2.0
[3.1.0]: https://github.com/Noble-Lab/casanovo/compare/v3.0.0...v3.1.0
[3.0.0]: https://github.com/Noble-Lab/casanovo/compare/v2.1.1...v3.0.0
5 changes: 3 additions & 2 deletions casanovo/casanovo.py
@@ -67,7 +67,7 @@
@click.option(
"--output",
help="The base output file name to store logging (extension: .log) and "
"(optionally) prediction results (extension: .csv).",
"(optionally) prediction results (extension: .mztab).",
type=click.Path(dir_okay=False),
)
def main(
@@ -96,7 +96,8 @@ def main(
f"casanovo_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}",
)
else:
output = os.path.splitext(os.path.abspath(output))[0]
basename, ext = os.path.splitext(os.path.abspath(output))
output = basename if ext.lower() in (".log", ".mztab") else output

# Configure logging.
logging.captureWarnings(True)
2 changes: 2 additions & 0 deletions casanovo/config.py
@@ -50,7 +50,9 @@ class Config:
dropout=float,
dim_intensity=int,
max_length=int,
residues=dict,
n_log=int,
tb_summarywriter=str,
warmup_iters=int,
max_iters=int,
learning_rate=float,
5 changes: 1 addition & 4 deletions casanovo/config.yaml
@@ -45,9 +45,6 @@ dropout: 0.0
# Number of dimensions to use for encoding peak intensity
# Projected up to ``dim_model`` by default and summed with the peak m/z encoding
dim_intensity:
# Option to provide a pre-trained spectrum encoder when training
# Trained from scratch by default
custom_encoder:
# Max decoded peptide length
max_length: 100
# Amino acid and modification vocabulary to use
@@ -117,7 +114,7 @@ save_model: True
model_save_folder_path: ""
# Set to "False" to save the PyTorch model instance
save_weights_only: True
# Model checkpointing frequency in training steps
# Model validation and checkpointing frequency in training steps
every_n_train_steps: 50_000
# Disable usage of a GPU (including Apple MPS):
no_gpu: False
9 changes: 4 additions & 5 deletions casanovo/data/ms_io.py
@@ -135,11 +135,9 @@ def set_ms_run(self, peak_filenames: List[str]) -> None:
The input peak file name(s).
"""
for i, filename in enumerate(natsort.natsorted(peak_filenames), 1):
filename = os.path.abspath(filename)
self.metadata.append(
(
f"ms_run[{i}]-location",
Path(os.path.abspath(filename)).as_uri(),
),
(f"ms_run[{i}]-location", Path(filename).as_uri()),
)
self._run_map[filename] = i

@@ -180,6 +178,7 @@ def save(self) -> None:
for i, psm in enumerate(
natsort.natsorted(self.psms, key=operator.itemgetter(1)), 1
):
filename, idx = os.path.abspath(psm[1][0]), psm[1][1]
writer.writerow(
[
"PSM",
@@ -200,7 +199,7 @@
psm[3], # charge
psm[4], # exp_mass_to_charge
psm[5], # calc_mass_to_charge
f"ms_run[{self._run_map[psm[1][0]]}]:{psm[1][1]}",
f"ms_run[{self._run_map[filename]}]:{idx}",
"null", # pre
"null", # post
"null", # start
89 changes: 42 additions & 47 deletions casanovo/denovo/model.py
@@ -754,16 +754,10 @@ def validation_step(

# Calculate and log amino acid and peptide match evaluation metrics from
# the predicted peptides.
peptides_pred_raw, _ = self.forward(batch[0], batch[1])
# FIXME: Temporary fix to skip predictions with multiple stop tokens.
peptides_pred, peptides_true = [], []
for peptide_pred, peptide_true in zip(peptides_pred_raw, batch[2]):
if len(peptide_pred) > 0:
if peptide_pred[0] == "$":
peptide_pred = peptide_pred[1:] # Remove stop token.
if "$" not in peptide_pred and len(peptide_pred) > 0:
peptides_pred.append(peptide_pred)
peptides_true.append(peptide_true)
peptides_pred, peptides_true = [], batch[2]
for spectrum_preds in self.forward(batch[0], batch[1]):
for _, _, pred in spectrum_preds:
peptides_pred.append(pred)
aa_precision, _, pep_precision = evaluate.aa_match_metrics(
*evaluate.aa_match_batch(
peptides_pred, peptides_true, self.decoder._peptide_mass.masses
@@ -831,7 +825,11 @@ def on_train_epoch_end(self) -> None:
Log the training loss at the end of each epoch.
"""
train_loss = self.trainer.callback_metrics["CELoss"]["train"].detach()
self._history[-1]["train"] = train_loss
metrics = {
"step": self.trainer.global_step,
"train": train_loss,
}
self._history.append(metrics)
self._log_history()

def on_validation_epoch_end(self) -> None:
@@ -840,15 +838,14 @@
"""
callback_metrics = self.trainer.callback_metrics
metrics = {
"epoch": self.trainer.current_epoch,
"step": self.trainer.global_step,
"valid": callback_metrics["CELoss"]["valid"].detach(),
"valid_aa_precision": callback_metrics["aa_precision"][
"valid"
].detach(),
"valid_aa_recall": callback_metrics["aa_recall"]["valid"].detach(),
"valid_pep_recall": callback_metrics["pep_recall"][
"valid"
].detach(),
"valid_aa_precision": callback_metrics[
"AA precision at coverage=1"
]["valid"].detach(),
"valid_pep_precision": callback_metrics[
"Peptide precision at coverage=1"
]["valid"].detach(),
}
self._history.append(metrics)
self._log_history()
@@ -892,35 +889,33 @@ def _log_history(self) -> None:
Write log to console, if requested.
"""
# Log only if all output for the current epoch is recorded.
if len(self._history) > 0 and len(self._history[-1]) == 6:
if len(self._history) == 1:
logger.info(
"Epoch\tTrain loss\tValid loss\tAA precision\tAA recall\t"
"Peptide recall"
)
metrics = self._history[-1]
if metrics["epoch"] % self.n_log == 0:
logger.info(
"%i\t%.6f\t%.6f\t%.6f\t%.6f\t%.6f",
metrics["epoch"] + 1,
metrics.get("train", np.nan),
metrics.get("valid", np.nan),
metrics.get("valid_aa_precision", np.nan),
metrics.get("valid_aa_recall", np.nan),
metrics.get("valid_pep_recall", np.nan),
)
if self.tb_summarywriter is not None:
for descr, key in [
("loss/train_crossentropy_loss", "train"),
("loss/dev_crossentropy_loss", "valid"),
("eval/dev_aa_precision", "valid_aa_precision"),
("eval/dev_aa_recall", "valid_aa_recall"),
("eval/dev_pep_recall", "valid_pep_recall"),
]:
if len(self._history) == 0:
return
if len(self._history) == 1:
logger.info(
"Step\tTrain loss\tValid loss\tPeptide precision\tAA precision"
)
metrics = self._history[-1]
if metrics["step"] % self.n_log == 0:
logger.info(
"%i\t%.6f\t%.6f\t%.6f\t%.6f",
metrics["step"],
metrics.get("train", np.nan),
metrics.get("valid", np.nan),
metrics.get("valid_pep_precision", np.nan),
metrics.get("valid_aa_precision", np.nan),
)
if self.tb_summarywriter is not None:
for descr, key in [
("loss/train_crossentropy_loss", "train"),
("loss/val_crossentropy_loss", "valid"),
("eval/val_pep_precision", "valid_pep_precision"),
("eval/val_aa_precision", "valid_aa_precision"),
]:
metric_value = metrics.get(key, np.nan)
if not np.isnan(metric_value):
self.tb_summarywriter.add_scalar(
descr,
metrics.get(key, np.nan),
metrics["epoch"] + 1,
descr, metric_value, metrics["step"]
)

def configure_optimizers(
8 changes: 4 additions & 4 deletions casanovo/denovo/model_runner.py
@@ -104,7 +104,6 @@ def _execute_existing(
n_layers=config["n_layers"],
dropout=config["dropout"],
dim_intensity=config["dim_intensity"],
custom_encoder=config["custom_encoder"],
max_length=config["max_length"],
residues=config["residues"],
max_charge=config["max_charge"],
@@ -124,7 +123,7 @@ def _execute_existing(
if len(peak_filenames := _get_peak_filenames(peak_path, peak_ext)) == 0:
logger.error("Could not find peak files from %s", peak_path)
raise FileNotFoundError("Could not find peak files")
else:
elif out_writer is not None:
out_writer.set_ms_run(peak_filenames)
peak_is_index = any(
[os.path.splitext(fn)[1] in (".h5", ".hdf5") for fn in peak_filenames]
@@ -260,7 +259,6 @@ def train(
n_layers=config["n_layers"],
dropout=config["dropout"],
dim_intensity=config["dim_intensity"],
custom_encoder=config["custom_encoder"],
max_length=config["max_length"],
residues=config["residues"],
max_charge=config["max_charge"],
@@ -307,10 +305,12 @@ def train(
auto_select_gpus=True,
callbacks=callbacks,
devices=_get_devices(config["no_gpu"]),
enable_checkpointing=config["save_model"],
logger=config["logger"],
max_epochs=config["max_epochs"],
num_sanity_val_steps=config["num_sanity_val_steps"],
strategy=_get_strategy(),
val_check_interval=config["every_n_train_steps"],
)
# Train the model.
trainer.fit(
@@ -344,7 +344,7 @@ def _get_peak_filenames(
path = os.path.expanduser(path)
path = os.path.expandvars(path)
return [
fn
os.path.abspath(fn)
for fn in glob.glob(path, recursive=True)
if os.path.splitext(fn.lower())[1] in supported_ext
]
94 changes: 94 additions & 0 deletions docs/faq.md
@@ -66,3 +66,97 @@ Note that taking very frequent model snapshots will result in somewhat slower tr

When saving a model snapshot, Casanovo will use the validation data to compute performance measures (training loss, validation loss, amino acid precision, and peptide precision) and print this information to the console and log file.
After your training job is finished, you can identify the best performing model that achieves the maximum peptide and amino acid precision from the log file and use the corresponding model snapshot.

**Even though I added new post-translational modifications to the configuration file, Casanovo didn't identify those peptides.**

Casanovo can only make predictions using post-translational modifications (PTMs) that were included when training the model.
If you want to add new types of PTMs, then you will need to retrain the model.

The [`config.yaml` configuration file](https://github.com/Noble-Lab/casanovo/blob/main/casanovo/config.yaml) contains all amino acids and PTMs that Casanovo knows.
By default, this includes oxidation of methionine, deamidation of asparagine and glutamine, N-terminal acetylation, N-terminal carbamylation, and an N-terminal loss of ammonia.
(Additionally, cysteines are _always_ considered to be carbamidomethylated.)
Simply making changes to the `residues` alphabet in the configuration file is insufficient to identify new types of PTMs with Casanovo, however.
Accordingly, this option is not marked with `(I)` in the configuration file, the marker for options that can be modified during inference.
All remaining options require training a new Casanovo model.

Therefore, to learn the spectral signatures of previously unknown PTMs, a new Casanovo model needs to be _trained_.
To include new PTMs in Casanovo, you need to:
1. Update the `residues` alphabet in the configuration file accordingly.
2. Compile a large training dataset that includes those PTMs and format this as an annotated MGF file. Note that you can include some or all of the data that was originally used to train Casanovo (see above), in addition to the data that includes your new types of PTMs.
3. Train a new version of Casanovo on this dataset.

It is unfortunately not possible to fine-tune a pre-trained Casanovo model to add new types of PTMs.
Instead, such a model must be trained from scratch.
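
As a sketch of step 1 above: the `residues` alphabet maps each amino acid token, optionally carrying a modification, to its monoisotopic mass. The snippet below mirrors the YAML structure as a Python mapping; the phospho-serine entry and its mass delta are illustrative assumptions, not shipped defaults.

```python
# Hedged sketch of an extended `residues` alphabet (Python mirror of the
# YAML in config.yaml). Keys are amino acid tokens, optionally suffixed
# with a modification mass delta; values are monoisotopic masses in Da.
residues = {
    "G": 57.021464,
    "A": 71.037114,
    "S": 87.032028,
    # ... the remaining standard residues and existing modifications ...
    "S+79.966": 166.998359,  # hypothetical new PTM: phosphorylation of serine
}
```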

**How can I generate a precision–coverage curve?**

You can evaluate a trained Casanovo model against ground-truth peptide labels using a precision–coverage curve.

1. Run Casanovo in sequencing or evaluation mode on your MS/MS data, [as described here](https://casanovo.readthedocs.io/en/latest/getting_started.html#running-casanovo).
2. Collect the ground-truth peptide labels as well as the peptide labels predicted by Casanovo. Note that Casanovo might not report a peptide for every spectrum if the spectra are invalid (e.g. not enough peaks), so make sure that both pieces of information are correctly linked to each other (using the `spectra_ref` column in the mzTab output file produced by Casanovo).
3. Use the following script to plot a precision–coverage curve:
```python
import depthcharge
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import auc

from casanovo.denovo import evaluate


# `psm_sequences` is assumed to be a DataFrame with at least the following
# three columns:
# - "sequence": The ground-truth peptide labels.
# - "sequence_pred": The predicted peptide labels.
# - "search_engine_score[1]": The prediction scores.
psm_sequences = ... # TODO: Get the PSM information.

# Sort the PSMs by decreasing prediction score.
psm_sequences = psm_sequences.sort_values(
"search_engine_score[1]", ascending=False
)
# Find matches between the true and predicted peptide sequences.
aa_matches_batch = evaluate.aa_match_batch(
psm_sequences["sequence"],
psm_sequences["sequence_pred"],
depthcharge.masses.PeptideMass("massivekb").masses,
)
# Calculate the peptide precision and coverage.
peptide_matches = np.asarray([aa_match[1] for aa_match in aa_matches_batch[0]])
precision = np.cumsum(peptide_matches) / np.arange(1, len(peptide_matches) + 1)
coverage = np.arange(1, len(peptide_matches) + 1) / len(peptide_matches)
# Calculate the score threshold at which peptide predictions don't fit the
# precursor m/z tolerance anymore.
threshold = np.argmax(psm_sequences["search_engine_score[1]"] < 0)

# Print the performance values.
print(f"Peptide precision = {precision[threshold]:.3f}")
print(f"Coverage = {coverage[threshold]:.3f}")
print(f"Peptide precision @ coverage=1 = {precision[-1]:.3f}")

# Plot the precision–coverage curve.
width = 4
height = width / 1.618
fig, ax = plt.subplots(figsize=(width, height))

ax.plot(
coverage, precision, label=f"Casanovo AUC = {auc(coverage, precision):.3f}"
)
ax.scatter(
coverage[threshold],
precision[threshold],
s=50,
marker="D",
edgecolors="black",
zorder=10,
)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

ax.set_xlabel("Coverage")
ax.set_ylabel("Peptide precision")
ax.legend(loc="lower left")

plt.savefig("prec_cov.png", dpi=300, bbox_inches="tight")
plt.close()
```
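
For step 2, one way to assemble `psm_sequences` is sketched below, assuming Casanovo's mzTab output and the pyteomics library; the output file name and the `ground_truth` table are placeholders for illustration.

```python
from pyteomics.mztab import MzTab

# Read the PSM section of Casanovo's mzTab output into a DataFrame
# (columns include "sequence", "search_engine_score[1]", "spectra_ref").
psms = MzTab("casanovo_output.mztab").spectrum_match_table

# TODO: Collect the ground-truth labels, e.g. from an annotated MGF file,
# as a DataFrame with "spectra_ref" and "sequence" columns.
ground_truth = ...

# Rename Casanovo's predictions and join on the spectrum reference so that
# spectra without a reported peptide are excluded consistently.
psm_sequences = psms.rename(columns={"sequence": "sequence_pred"}).merge(
    ground_truth, on="spectra_ref"
)
```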
4 changes: 4 additions & 0 deletions docs/getting_started.md
@@ -72,6 +72,10 @@ To assist users, if no model file is specified Casanovo will try to download and

Not all releases might have a model file included on the [Releases page](https://github.com/Noble-Lab/casanovo/releases), in which case model weights for alternative releases with the same major version number can be used.

The most recent model weights for Casanovo version 3.x are currently provided under [Casanovo v3.0.0](https://github.com/Noble-Lab/casanovo/releases/tag/v3.0.0):
- `casanovo_massivekb.ckpt`: Default Casanovo weights to use when analyzing tryptic data. These weights will be downloaded automatically if no weights are explicitly specified.
- `casanovo_non-enzy.checkpt`: Casanovo weights to use when analyzing non-tryptic data, obtained by fine-tuning the tryptic model on multi-enzyme data. These weights need to be downloaded manually.
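
For example, after manually downloading the non-tryptic weights, they can be passed to Casanovo via the `--model` option (a hedged invocation sketch for the v3.x command-line interface; the peak file name is a placeholder):

```sh
casanovo --mode=denovo --model=casanovo_non-enzy.checkpt --peak_path=spectra.mgf
```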

## Running Casanovo

```{note}