Removed custom_encoder option #250

Closed
wants to merge 13 commits
28 changes: 26 additions & 2 deletions CHANGELOG.md
@@ -6,6 +6,28 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

## [Unreleased]

## [3.5.0] - 2023-08-16

### Fixed

- Don't try to assign non-existing output writer during eval mode.
- Specifying custom residues to retrain Casanovo is now possible.

## [3.4.0] - 2023-06-19

### Added

- `every_n_train_steps` parameter now controls the frequency of both validation epochs and model checkpointing during training.

### Changed

- We now log steps rather than epochs as units of progress during training.
- Validation performance metrics are logged (and added to tensorboard) during the validation epoch, and the training loss is logged at the end of each training epoch, i.e. training and validation metrics are logged asynchronously.

### Fixed

- Correctly refer to input peak files by their full file path.

## [3.3.0] - 2023-04-04

### Added
@@ -159,8 +181,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

- Initial Casanovo version.

[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v3.3.0...HEAD
[3.2.0]: https://github.com/Noble-Lab/casanovo/compare/v3.2.0...v3.3.0
[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v3.5.0...HEAD
[3.5.0]: https://github.com/Noble-Lab/casanovo/compare/v3.4.0...v3.5.0
[3.4.0]: https://github.com/Noble-Lab/casanovo/compare/v3.3.0...v3.4.0
[3.3.0]: https://github.com/Noble-Lab/casanovo/compare/v3.2.0...v3.3.0
[3.2.0]: https://github.com/Noble-Lab/casanovo/compare/v3.1.0...v3.2.0
[3.1.0]: https://github.com/Noble-Lab/casanovo/compare/v3.0.0...v3.1.0
[3.0.0]: https://github.com/Noble-Lab/casanovo/compare/v2.1.1...v3.0.0
5 changes: 3 additions & 2 deletions casanovo/casanovo.py
@@ -67,7 +67,7 @@
@click.option(
"--output",
help="The base output file name to store logging (extension: .log) and "
"(optionally) prediction results (extension: .csv).",
"(optionally) prediction results (extension: .mztab).",
type=click.Path(dir_okay=False),
)
def main(
@@ -96,7 +96,8 @@ def main(
f"casanovo_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}",
)
else:
output = os.path.splitext(os.path.abspath(output))[0]
basename, ext = os.path.splitext(os.path.abspath(output))
output = basename if ext.lower() in (".log", ".mztab") else output

# Configure logging.
logging.captureWarnings(True)
2 changes: 2 additions & 0 deletions casanovo/config.py
@@ -50,7 +50,9 @@ class Config:
dropout=float,
dim_intensity=int,
max_length=int,
residues=dict,
n_log=int,
tb_summarywriter=str,
warmup_iters=int,
max_iters=int,
learning_rate=float,
5 changes: 1 addition & 4 deletions casanovo/config.yaml
@@ -45,9 +45,6 @@ dropout: 0.0
# Number of dimensions to use for encoding peak intensity
# Projected up to ``dim_model`` by default and summed with the peak m/z encoding
dim_intensity:
# Option to provide a pre-trained spectrum encoder when training
# Trained from scratch by default
custom_encoder:
# Max decoded peptide length
max_length: 100
# Amino acid and modification vocabulary to use
@@ -117,7 +114,7 @@ save_model: True
model_save_folder_path: ""
# Set to "False" to save the PyTorch model instance
save_weights_only: True
# Model checkpointing frequency in training steps
# Model validation and checkpointing frequency in training steps
every_n_train_steps: 50_000
# Disable usage of a GPU (including Apple MPS):
no_gpu: False
9 changes: 4 additions & 5 deletions casanovo/data/ms_io.py
@@ -135,11 +135,9 @@ def set_ms_run(self, peak_filenames: List[str]) -> None:
The input peak file name(s).
"""
for i, filename in enumerate(natsort.natsorted(peak_filenames), 1):
filename = os.path.abspath(filename)
self.metadata.append(
(
f"ms_run[{i}]-location",
Path(os.path.abspath(filename)).as_uri(),
),
(f"ms_run[{i}]-location", Path(filename).as_uri()),
)
self._run_map[filename] = i

@@ -180,6 +178,7 @@ def save(self) -> None:
for i, psm in enumerate(
natsort.natsorted(self.psms, key=operator.itemgetter(1)), 1
):
filename, idx = os.path.abspath(psm[1][0]), psm[1][1]
writer.writerow(
[
"PSM",
@@ -200,7 +199,7 @@
psm[3], # charge
psm[4], # exp_mass_to_charge
psm[5], # calc_mass_to_charge
f"ms_run[{self._run_map[psm[1][0]]}]:{psm[1][1]}",
f"ms_run[{self._run_map[filename]}]:{idx}",
"null", # pre
"null", # post
"null", # start
89 changes: 42 additions & 47 deletions casanovo/denovo/model.py
@@ -754,16 +754,10 @@ def validation_step(

# Calculate and log amino acid and peptide match evaluation metrics from
# the predicted peptides.
peptides_pred_raw, _ = self.forward(batch[0], batch[1])
# FIXME: Temporary fix to skip predictions with multiple stop tokens.
peptides_pred, peptides_true = [], []
for peptide_pred, peptide_true in zip(peptides_pred_raw, batch[2]):
if len(peptide_pred) > 0:
if peptide_pred[0] == "$":
peptide_pred = peptide_pred[1:] # Remove stop token.
if "$" not in peptide_pred and len(peptide_pred) > 0:
peptides_pred.append(peptide_pred)
peptides_true.append(peptide_true)
peptides_pred, peptides_true = [], batch[2]
for spectrum_preds in self.forward(batch[0], batch[1]):
for _, _, pred in spectrum_preds:
peptides_pred.append(pred)
aa_precision, _, pep_precision = evaluate.aa_match_metrics(
*evaluate.aa_match_batch(
peptides_pred, peptides_true, self.decoder._peptide_mass.masses
@@ -831,7 +825,11 @@ def on_train_epoch_end(self) -> None:
Log the training loss at the end of each epoch.
"""
train_loss = self.trainer.callback_metrics["CELoss"]["train"].detach()
self._history[-1]["train"] = train_loss
metrics = {
"step": self.trainer.global_step,
"train": train_loss,
}
self._history.append(metrics)
self._log_history()

def on_validation_epoch_end(self) -> None:
@@ -840,15 +838,14 @@
"""
callback_metrics = self.trainer.callback_metrics
metrics = {
"epoch": self.trainer.current_epoch,
"step": self.trainer.global_step,
"valid": callback_metrics["CELoss"]["valid"].detach(),
"valid_aa_precision": callback_metrics["aa_precision"][
"valid"
].detach(),
"valid_aa_recall": callback_metrics["aa_recall"]["valid"].detach(),
"valid_pep_recall": callback_metrics["pep_recall"][
"valid"
].detach(),
"valid_aa_precision": callback_metrics[
"AA precision at coverage=1"
]["valid"].detach(),
"valid_pep_precision": callback_metrics[
"Peptide precision at coverage=1"
]["valid"].detach(),
}
self._history.append(metrics)
self._log_history()
@@ -892,35 +889,33 @@ def _log_history(self) -> None:
Write log to console, if requested.
"""
# Log only if all output for the current epoch is recorded.
if len(self._history) > 0 and len(self._history[-1]) == 6:
if len(self._history) == 1:
logger.info(
"Epoch\tTrain loss\tValid loss\tAA precision\tAA recall\t"
"Peptide recall"
)
metrics = self._history[-1]
if metrics["epoch"] % self.n_log == 0:
logger.info(
"%i\t%.6f\t%.6f\t%.6f\t%.6f\t%.6f",
metrics["epoch"] + 1,
metrics.get("train", np.nan),
metrics.get("valid", np.nan),
metrics.get("valid_aa_precision", np.nan),
metrics.get("valid_aa_recall", np.nan),
metrics.get("valid_pep_recall", np.nan),
)
if self.tb_summarywriter is not None:
for descr, key in [
("loss/train_crossentropy_loss", "train"),
("loss/dev_crossentropy_loss", "valid"),
("eval/dev_aa_precision", "valid_aa_precision"),
("eval/dev_aa_recall", "valid_aa_recall"),
("eval/dev_pep_recall", "valid_pep_recall"),
]:
if len(self._history) == 0:
return
if len(self._history) == 1:
logger.info(
"Step\tTrain loss\tValid loss\tPeptide precision\tAA precision"
)
metrics = self._history[-1]
if metrics["step"] % self.n_log == 0:
logger.info(
"%i\t%.6f\t%.6f\t%.6f\t%.6f",
metrics["step"],
metrics.get("train", np.nan),
metrics.get("valid", np.nan),
metrics.get("valid_pep_precision", np.nan),
metrics.get("valid_aa_precision", np.nan),
)
if self.tb_summarywriter is not None:
for descr, key in [
("loss/train_crossentropy_loss", "train"),
("loss/val_crossentropy_loss", "valid"),
("eval/val_pep_precision", "valid_pep_precision"),
("eval/val_aa_precision", "valid_aa_precision"),
]:
metric_value = metrics.get(key, np.nan)
if not np.isnan(metric_value):
self.tb_summarywriter.add_scalar(
descr,
metrics.get(key, np.nan),
metrics["epoch"] + 1,
descr, metric_value, metrics["step"]
)

def configure_optimizers(
8 changes: 4 additions & 4 deletions casanovo/denovo/model_runner.py
@@ -104,7 +104,6 @@ def _execute_existing(
n_layers=config["n_layers"],
dropout=config["dropout"],
dim_intensity=config["dim_intensity"],
custom_encoder=config["custom_encoder"],
max_length=config["max_length"],
residues=config["residues"],
max_charge=config["max_charge"],
@@ -124,7 +123,7 @@ def _execute_existing(
if len(peak_filenames := _get_peak_filenames(peak_path, peak_ext)) == 0:
logger.error("Could not find peak files from %s", peak_path)
raise FileNotFoundError("Could not find peak files")
else:
elif out_writer is not None:
out_writer.set_ms_run(peak_filenames)
peak_is_index = any(
[os.path.splitext(fn)[1] in (".h5", ".hdf5") for fn in peak_filenames]
@@ -260,7 +259,6 @@ def train(
n_layers=config["n_layers"],
dropout=config["dropout"],
dim_intensity=config["dim_intensity"],
custom_encoder=config["custom_encoder"],
max_length=config["max_length"],
residues=config["residues"],
max_charge=config["max_charge"],
@@ -307,10 +305,12 @@ def train(
auto_select_gpus=True,
callbacks=callbacks,
devices=_get_devices(config["no_gpu"]),
enable_checkpointing=config["save_model"],
logger=config["logger"],
max_epochs=config["max_epochs"],
num_sanity_val_steps=config["num_sanity_val_steps"],
strategy=_get_strategy(),
val_check_interval=config["every_n_train_steps"],
)
# Train the model.
trainer.fit(
@@ -344,7 +344,7 @@ def _get_peak_filenames(
path = os.path.expanduser(path)
path = os.path.expandvars(path)
return [
fn
os.path.abspath(fn)
for fn in glob.glob(path, recursive=True)
if os.path.splitext(fn.lower())[1] in supported_ext
]
94 changes: 94 additions & 0 deletions docs/faq.md
@@ -66,3 +66,97 @@ Note that taking very frequent model snapshots will result in somewhat slower tr

When saving a model snapshot, Casanovo will use the validation data to compute performance measures (training loss, validation loss, amino acid precision, and peptide precision) and print this information to the console and log file.
After your training job is finished, you can identify the best performing model that achieves the maximum peptide and amino acid precision from the log file and use the corresponding model snapshot.

**Even though I added new post-translational modifications to the configuration file, Casanovo didn't identify those peptides.**

Casanovo can only make predictions using post-translational modifications (PTMs) that were included when training the model.
If you want to add new types of PTMs, then you will need to retrain the model.

The [`config.yaml` configuration file](https://github.com/Noble-Lab/casanovo/blob/main/casanovo/config.yaml) contains all amino acids and PTMs that Casanovo knows.
By default, this includes oxidation of methionine, deamidation of asparagine and glutamine, N-terminal acetylation, N-terminal carbamylation, and an N-terminal loss of ammonia.
(Additionally, cysteines are _always_ considered to be carbamidomethylated.)
Simply making changes to the `residues` alphabet in the configuration file is insufficient to identify new types of PTMs with Casanovo, however.
Accordingly, this option is not marked with `(I)` in the configuration file, the marker for options that can be modified during inference.
All remaining options require training a new Casanovo model.

Therefore, to learn the spectral signatures of previously unknown PTMs, a new Casanovo model needs to be _trained_.
To include new PTMs in Casanovo, you need to:
1. Update the `residues` alphabet in the configuration file accordingly.
2. Compile a large training dataset that includes those PTMs and format this as an annotated MGF file. Note that you can include some or all of the data that was originally used to train Casanovo (see above), in addition to the data that includes your new types of PTMs.
3. Train a new version of Casanovo on this dataset.

It is unfortunately not possible to fine-tune a pre-trained Casanovo model to add new types of PTMs.
Instead, such a model must be trained from scratch.
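
As a sketch of step 1 above: the `residues` alphabet maps each amino acid token, optionally carrying a modification, to its monoisotopic mass. The snippet below mirrors the YAML structure as a Python mapping; the phospho-serine entry and its mass delta are illustrative assumptions, not shipped defaults.

```python
# Hedged sketch of an extended `residues` alphabet (Python mirror of the
# YAML in config.yaml). Keys are amino acid tokens, optionally suffixed
# with a modification mass delta; values are monoisotopic masses in Da.
residues = {
    "G": 57.021464,
    "A": 71.037114,
    "S": 87.032028,
    # ... the remaining standard residues and existing modifications ...
    "S+79.966": 166.998359,  # hypothetical new PTM: phosphorylation of serine
}
```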

**How can I generate a precision–coverage curve?**

You can evaluate a trained Casanovo model against ground-truth peptide labels using a precision–coverage curve.

1. Run Casanovo in sequencing or evaluation mode on your MS/MS data, [as described here](https://casanovo.readthedocs.io/en/latest/getting_started.html#running-casanovo).
2. Collect the ground-truth peptide labels as well as the peptide labels predicted by Casanovo. Note that Casanovo might not report a peptide for every spectrum if the spectra are invalid (e.g. not enough peaks), so make sure that both pieces of information are correctly linked to each other (using the `spectra_ref` column in the mzTab output file produced by Casanovo).
3. Use the following script to plot a precision–coverage curve:
```python
import depthcharge
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import auc

from casanovo.denovo import evaluate


# `psm_sequences` is assumed to be a DataFrame with at least the following
# three columns:
# - "sequence": The ground-truth peptide labels.
# - "sequence_pred": The predicted peptide labels.
# - "search_engine_score[1]": The prediction scores.
psm_sequences = ... # TODO: Get the PSM information.

# Sort the PSMs by decreasing prediction score.
psm_sequences = psm_sequences.sort_values(
"search_engine_score[1]", ascending=False
)
# Find matches between the true and predicted peptide sequences.
aa_matches_batch = evaluate.aa_match_batch(
psm_sequences["sequence"],
psm_sequences["sequence_pred"],
depthcharge.masses.PeptideMass("massivekb").masses,
)
# Calculate the peptide precision and coverage.
peptide_matches = np.asarray([aa_match[1] for aa_match in aa_matches_batch[0]])
precision = np.cumsum(peptide_matches) / np.arange(1, len(peptide_matches) + 1)
coverage = np.arange(1, len(peptide_matches) + 1) / len(peptide_matches)
# Calculate the score threshold at which peptide predictions don't fit the
# precursor m/z tolerance anymore.
threshold = np.argmax(psm_sequences["search_engine_score[1]"] < 0)

# Print the performance values.
print(f"Peptide precision = {precision[threshold]:.3f}")
print(f"Coverage = {coverage[threshold]:.3f}")
print(f"Peptide precision @ coverage=1 = {precision[-1]:.3f}")

# Plot the precision–coverage curve.
width = 4
height = width / 1.618
fig, ax = plt.subplots(figsize=(width, height))

ax.plot(
coverage, precision, label=f"Casanovo AUC = {auc(coverage, precision):.3f}"
)
ax.scatter(
coverage[threshold],
precision[threshold],
s=50,
marker="D",
edgecolors="black",
zorder=10,
)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

ax.set_xlabel("Coverage")
ax.set_ylabel("Peptide precision")
ax.legend(loc="lower left")

plt.savefig("prec_cov.png", dpi=300, bbox_inches="tight")
plt.close()
```
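
For step 2, one way to assemble `psm_sequences` is sketched below, assuming Casanovo's mzTab output and the pyteomics library; the output file name and the `ground_truth` table are placeholders for illustration.

```python
from pyteomics.mztab import MzTab

# Read the PSM section of Casanovo's mzTab output into a DataFrame
# (columns include "sequence", "search_engine_score[1]", "spectra_ref").
psms = MzTab("casanovo_output.mztab").spectrum_match_table

# TODO: Collect the ground-truth labels, e.g. from an annotated MGF file,
# as a DataFrame with "spectra_ref" and "sequence" columns.
ground_truth = ...

# Rename Casanovo's predictions and join on the spectrum reference so that
# spectra without a reported peptide are excluded consistently.
psm_sequences = psms.rename(columns={"sequence": "sequence_pred"}).merge(
    ground_truth, on="spectra_ref"
)
```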
4 changes: 4 additions & 0 deletions docs/getting_started.md
@@ -72,6 +72,10 @@ To assist users, if no model file is specified Casanovo will try to download and

Not all releases might have a model file included on the [Releases page](https://github.com/Noble-Lab/casanovo/releases), in which case model weights for alternative releases with the same major version number can be used.

The most recent model weights for Casanovo version 3.x are currently provided under [Casanovo v3.0.0](https://github.com/Noble-Lab/casanovo/releases/tag/v3.0.0):
- `casanovo_massivekb.ckpt`: Default Casanovo weights to use when analyzing tryptic data. These weights will be downloaded automatically if no weights are explicitly specified.
- `casanovo_non-enzy.checkpt`: Casanovo weights to use when analyzing non-tryptic data, obtained by fine-tuning the tryptic model on multi-enzyme data. These weights need to be downloaded manually.
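
For example, after manually downloading the non-tryptic weights, they can be passed to Casanovo via the `--model` option (a hedged invocation sketch for the v3.x command-line interface; the peak file name is a placeholder):

```sh
casanovo --mode=denovo --model=casanovo_non-enzy.checkpt --peak_path=spectra.mgf
```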

## Running Casanovo

```{note}