From 210cbae6a3c9c81a5409a265d348250bb6628167 Mon Sep 17 00:00:00 2001 From: Melih Yilmaz <32707537+melihyilmaz@users.noreply.github.com> Date: Mon, 10 Apr 2023 09:57:19 -0700 Subject: [PATCH 01/13] Fix val step and add unit test (#164) --- casanovo/denovo/model.py | 34 ++++++++++++++++------------------ tests/unit_tests/test_unit.py | 13 +++++++++++++ 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 2f6e99a0..b32fbd96 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -754,19 +754,18 @@ def validation_step( # Calculate and log amino acid and peptide match evaluation metrics from # the predicted peptides. - peptides_pred_raw, _ = self.forward(batch[0], batch[1]) - # FIXME: Temporary fix to skip predictions with multiple stop tokens. - peptides_pred, peptides_true = [], [] - for peptide_pred, peptide_true in zip(peptides_pred_raw, batch[2]): - if len(peptide_pred) > 0: - if peptide_pred[0] == "$": - peptide_pred = peptide_pred[1:] # Remove stop token. 
- if "$" not in peptide_pred and len(peptide_pred) > 0: - peptides_pred.append(peptide_pred) - peptides_true.append(peptide_true) + predicted_peptide_seq = [] + true_peptide_seq = batch[2] + + for spectrum_preds in self.forward(batch[0], batch[1]): + for _, _, peptide_seq in spectrum_preds: + predicted_peptide_seq.append(peptide_seq) + aa_precision, _, pep_precision = evaluate.aa_match_metrics( *evaluate.aa_match_batch( - peptides_pred, peptides_true, self.decoder._peptide_mass.masses + predicted_peptide_seq, + true_peptide_seq, + self.decoder._peptide_mass.masses, ) ) log_args = dict(on_step=False, on_epoch=True, sync_dist=True) @@ -842,13 +841,12 @@ def on_validation_epoch_end(self) -> None: metrics = { "epoch": self.trainer.current_epoch, "valid": callback_metrics["CELoss"]["valid"].detach(), - "valid_aa_precision": callback_metrics["aa_precision"][ - "valid" - ].detach(), - "valid_aa_recall": callback_metrics["aa_recall"]["valid"].detach(), - "valid_pep_recall": callback_metrics["pep_recall"][ - "valid" - ].detach(), + "valid_aa_precision": callback_metrics[ + "AA precision at coverage=1" + ]["valid"].detach(), + "valid_pep_precision": callback_metrics[ + "Peptide precision at coverage=1" + ]["valid"].detach(), } self._history.append(metrics) self._log_history() diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index a7b0df59..09a5ab47 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -480,3 +480,16 @@ def test_spectrum_id_mzml(mzml_small, tmp_path): ): spectrum_id = str(filename), f"scan={scan_nr}" assert dataset.get_spectrum_id(i) == spectrum_id + + +def test_train_val_step_functions(): + """Test train and validation step functions operating on batches.""" + model = Spec2Pep(n_beams=1, residues="massivekb", min_peptide_len=4) + spectra = torch.zeros(1, 5, 2) + precursors = torch.tensor([[469.25364, 2.0, 235.63410]]) + peptides = ["PEPK"] + batch = (spectra, precursors, peptides) + + # Check if valid 
loss value returned + assert model.training_step(batch) > 0 + assert model.validation_step(batch) > 0 From 1247bc5412d30491e97e02f4bbd74e386f43070f Mon Sep 17 00:00:00 2001 From: Melih Yilmaz <32707537+melihyilmaz@users.noreply.github.com> Date: Tue, 11 Apr 2023 11:56:57 -0700 Subject: [PATCH 02/13] Add validation frequency option (#165) * Fix logging and checkpointing bug * Add option to validate every n steps --- CHANGELOG.md | 3 ++ casanovo/config.py | 1 + casanovo/config.yaml | 2 +- casanovo/denovo/model.py | 49 +++++++++++++++++---------------- casanovo/denovo/model_runner.py | 2 ++ 5 files changed, 33 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8fb8c611..b2b10e27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,11 +12,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Included the `min_peptide_len` parameter in the configuration file to restrict predictions to peptide with a minimum length. - Export multiple PSMs per spectrum using the `top_match` parameter in the configuration file. +- `every_n_train_steps` parameter now controls the frequency of both validation epochs and model checkpointing during training. ### Changed - Calculate the amino acid scores as the average of the amino acid scores and the peptide score. - Spectra from mzML and mzXML peak files are referred to by their scan numbers in the mzTab output instead of their indexes. +- We now log steps rather than epochs as units of progress during training. +- Validation performance metrics are logged (and added to tensorboard) at the validation epoch, and training loss is logged at the end of training epoch, i.e. training and validation metrics are logged asynchronously. 
### Fixed diff --git a/casanovo/config.py b/casanovo/config.py index cae91a85..4dc93c26 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -51,6 +51,7 @@ class Config: dim_intensity=int, max_length=int, n_log=int, + tb_summarywriter=str, warmup_iters=int, max_iters=int, learning_rate=float, diff --git a/casanovo/config.yaml b/casanovo/config.yaml index f4528978..0e5c6b95 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -117,7 +117,7 @@ save_model: True model_save_folder_path: "" # Set to "False" to save the PyTorch model instance save_weights_only: True -# Model checkpointing frequency in training steps +# Model validation and checkpointing frequency in training steps every_n_train_steps: 50_000 # Disable usage of a GPU (including Apple MPS): no_gpu: False diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index b32fbd96..78ec6f84 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -830,7 +830,11 @@ def on_train_epoch_end(self) -> None: Log the training loss at the end of each epoch. """ train_loss = self.trainer.callback_metrics["CELoss"]["train"].detach() - self._history[-1]["train"] = train_loss + metrics = { + "step": self.trainer.global_step, + "train": train_loss, + } + self._history.append(metrics) self._log_history() def on_validation_epoch_end(self) -> None: @@ -839,7 +843,7 @@ def on_validation_epoch_end(self) -> None: """ callback_metrics = self.trainer.callback_metrics metrics = { - "epoch": self.trainer.current_epoch, + "step": self.trainer.global_step, "valid": callback_metrics["CELoss"]["valid"].detach(), "valid_aa_precision": callback_metrics[ "AA precision at coverage=1" @@ -890,36 +894,35 @@ def _log_history(self) -> None: Write log to console, if requested. """ # Log only if all output for the current epoch is recorded. 
- if len(self._history) > 0 and len(self._history[-1]) == 6: - if len(self._history) == 1: - logger.info( - "Epoch\tTrain loss\tValid loss\tAA precision\tAA recall\t" - "Peptide recall" - ) - metrics = self._history[-1] - if metrics["epoch"] % self.n_log == 0: + if len(self._history) == 1: + logger.info( + "Step\tTrain loss\tValid loss\tPeptide precision\tAA precision" + ) + metrics = self._history[-1] + if len(self._history) > 0: + if metrics["step"] % self.n_log == 0: logger.info( - "%i\t%.6f\t%.6f\t%.6f\t%.6f\t%.6f", - metrics["epoch"] + 1, + "%i\t%.6f\t%.6f\t%.6f\t%.6f", + metrics["step"], metrics.get("train", np.nan), metrics.get("valid", np.nan), + metrics.get("valid_pep_precision", np.nan), metrics.get("valid_aa_precision", np.nan), - metrics.get("valid_aa_recall", np.nan), - metrics.get("valid_pep_recall", np.nan), ) if self.tb_summarywriter is not None: for descr, key in [ ("loss/train_crossentropy_loss", "train"), - ("loss/dev_crossentropy_loss", "valid"), - ("eval/dev_aa_precision", "valid_aa_precision"), - ("eval/dev_aa_recall", "valid_aa_recall"), - ("eval/dev_pep_recall", "valid_pep_recall"), + ("loss/val_crossentropy_loss", "valid"), + ("eval/val_pep_precision", "valid_pep_precision"), + ("eval/val_aa_precision", "valid_aa_precision"), ]: - self.tb_summarywriter.add_scalar( - descr, - metrics.get(key, np.nan), - metrics["epoch"] + 1, - ) + metric_value = metrics.get(key, np.nan) + if metric_value is not np.nan: + self.tb_summarywriter.add_scalar( + descr, + metric_value, + metrics["step"], + ) def configure_optimizers( self, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index bfc4e1a9..b92dd780 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -307,10 +307,12 @@ def train( auto_select_gpus=True, callbacks=callbacks, devices=_get_devices(config["no_gpu"]), + enable_checkpointing=config["save_model"], logger=config["logger"], max_epochs=config["max_epochs"], 
num_sanity_val_steps=config["num_sanity_val_steps"], strategy=_get_strategy(), + val_check_interval=config["every_n_train_steps"], ) # Train the model. trainer.fit( From eed28bf12bb8c10788f3530ae6437742fd095753 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Fri, 14 Apr 2023 21:41:37 +0200 Subject: [PATCH 03/13] Minor refactoring + issue fix (#166) Fixes minor issues in #165. --- casanovo/denovo/model.py | 60 ++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 78ec6f84..8b3ae3e0 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -754,18 +754,13 @@ def validation_step( # Calculate and log amino acid and peptide match evaluation metrics from # the predicted peptides. - predicted_peptide_seq = [] - true_peptide_seq = batch[2] - + peptides_pred, peptides_true = [], batch[2] for spectrum_preds in self.forward(batch[0], batch[1]): - for _, _, peptide_seq in spectrum_preds: - predicted_peptide_seq.append(peptide_seq) - + for _, _, pred in spectrum_preds: + peptides_pred.append(pred) aa_precision, _, pep_precision = evaluate.aa_match_metrics( *evaluate.aa_match_batch( - predicted_peptide_seq, - true_peptide_seq, - self.decoder._peptide_mass.masses, + peptides_pred, peptides_true, self.decoder._peptide_mass.masses ) ) log_args = dict(on_step=False, on_epoch=True, sync_dist=True) @@ -894,35 +889,34 @@ def _log_history(self) -> None: Write log to console, if requested. """ # Log only if all output for the current epoch is recorded. 
+ if len(self._history) == 0: + return if len(self._history) == 1: logger.info( "Step\tTrain loss\tValid loss\tPeptide precision\tAA precision" ) metrics = self._history[-1] - if len(self._history) > 0: - if metrics["step"] % self.n_log == 0: - logger.info( - "%i\t%.6f\t%.6f\t%.6f\t%.6f", - metrics["step"], - metrics.get("train", np.nan), - metrics.get("valid", np.nan), - metrics.get("valid_pep_precision", np.nan), - metrics.get("valid_aa_precision", np.nan), - ) - if self.tb_summarywriter is not None: - for descr, key in [ - ("loss/train_crossentropy_loss", "train"), - ("loss/val_crossentropy_loss", "valid"), - ("eval/val_pep_precision", "valid_pep_precision"), - ("eval/val_aa_precision", "valid_aa_precision"), - ]: - metric_value = metrics.get(key, np.nan) - if metric_value is not np.nan: - self.tb_summarywriter.add_scalar( - descr, - metric_value, - metrics["step"], - ) + if metrics["step"] % self.n_log == 0: + logger.info( + "%i\t%.6f\t%.6f\t%.6f\t%.6f", + metrics["step"], + metrics.get("train", np.nan), + metrics.get("valid", np.nan), + metrics.get("valid_pep_precision", np.nan), + metrics.get("valid_aa_precision", np.nan), + ) + if self.tb_summarywriter is not None: + for descr, key in [ + ("loss/train_crossentropy_loss", "train"), + ("loss/val_crossentropy_loss", "valid"), + ("eval/val_pep_precision", "valid_pep_precision"), + ("eval/val_aa_precision", "valid_aa_precision"), + ]: + metric_value = metrics.get(key, np.nan) + if not np.isnan(metric_value): + self.tb_summarywriter.add_scalar( + descr, metric_value, metrics["step"] + ) def configure_optimizers( self, From 564ea603a058e20ce372d330bf159aa0ada89e65 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Mon, 17 Apr 2023 08:39:33 +0200 Subject: [PATCH 04/13] Always use full file paths (#168) * Always use full file paths Fixes #167. 
* Update changelog * Formatting fix --- CHANGELOG.md | 19 +++++++++++++++---- casanovo/data/ms_io.py | 9 ++++----- casanovo/denovo/model_runner.py | 2 +- tests/unit_tests/test_unit.py | 13 +++++++++++++ 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2b10e27..1c87c3d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,20 +6,31 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased] +### Added + +- `every_n_train_steps` parameter now controls the frequency of both validation epochs and model checkpointing during training. + +### Changed + +- We now log steps rather than epochs as units of progress during training. +- Validation performance metrics are logged (and added to tensorboard) at the validation epoch, and training loss is logged at the end of training epoch, i.e. training and validation metrics are logged asynchronously. + +### Fixed + +- Upgrade to Depthcharge v0.2.0 to fix sinusoidal encoding. +- Correctly refer to input peak files by their full file path. + ## [3.3.0] - 2023-04-04 ### Added - Included the `min_peptide_len` parameter in the configuration file to restrict predictions to peptide with a minimum length. - Export multiple PSMs per spectrum using the `top_match` parameter in the configuration file. -- `every_n_train_steps` parameter now controls the frequency of both validation epochs and model checkpointing during training. ### Changed - Calculate the amino acid scores as the average of the amino acid scores and the peptide score. - Spectra from mzML and mzXML peak files are referred to by their scan numbers in the mzTab output instead of their indexes. -- We now log steps rather than epochs as units of progress during training. -- Validation performance metrics are logged (and added to tensorboard) at the validation epoch, and training loss is logged at the end of training epoch, i.e. training and validation metrics are logged asynchronously. 
### Fixed @@ -163,7 +174,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Initial Casanovo version. [Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v3.3.0...HEAD -[3.2.0]: https://github.com/Noble-Lab/casanovo/compare/v3.2.0...v3.3.0 +[3.3.0]: https://github.com/Noble-Lab/casanovo/compare/v3.2.0...v3.3.0 [3.2.0]: https://github.com/Noble-Lab/casanovo/compare/v3.1.0...v3.2.0 [3.1.0]: https://github.com/Noble-Lab/casanovo/compare/v3.0.0...v3.1.0 [3.0.0]: https://github.com/Noble-Lab/casanovo/compare/v2.1.1...v3.0.0 diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index 6dfda598..e3b3a8d6 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -135,11 +135,9 @@ def set_ms_run(self, peak_filenames: List[str]) -> None: The input peak file name(s). """ for i, filename in enumerate(natsort.natsorted(peak_filenames), 1): + filename = os.path.abspath(filename) self.metadata.append( - ( - f"ms_run[{i}]-location", - Path(os.path.abspath(filename)).as_uri(), - ), + (f"ms_run[{i}]-location", Path(filename).as_uri()), ) self._run_map[filename] = i @@ -180,6 +178,7 @@ def save(self) -> None: for i, psm in enumerate( natsort.natsorted(self.psms, key=operator.itemgetter(1)), 1 ): + filename, idx = os.path.abspath(psm[1][0]), psm[1][1] writer.writerow( [ "PSM", @@ -200,7 +199,7 @@ def save(self) -> None: psm[3], # charge psm[4], # exp_mass_to_charge psm[5], # calc_mass_to_charge - f"ms_run[{self._run_map[psm[1][0]]}]:{psm[1][1]}", + f"ms_run[{self._run_map[filename]}]:{idx}", "null", # pre "null", # post "null", # start diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index b92dd780..fb5deeba 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -346,7 +346,7 @@ def _get_peak_filenames( path = os.path.expanduser(path) path = os.path.expandvars(path) return [ - fn + os.path.abspath(fn) for fn in glob.glob(path, recursive=True) if 
os.path.splitext(fn.lower())[1] in supported_ext ] diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 09a5ab47..bc0509bd 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -13,6 +13,7 @@ from casanovo import casanovo from casanovo import utils +from casanovo.data import ms_io from casanovo.data.datasets import SpectrumDataset, AnnotatedSpectrumDataset from casanovo.denovo.evaluate import aa_match_batch, aa_match_metrics from casanovo.denovo.model import Spec2Pep, _aa_pep_score @@ -493,3 +494,15 @@ def test_train_val_step_functions(): # Check if valid loss value returned assert model.training_step(batch) > 0 assert model.validation_step(batch) > 0 + + +def test_run_map(mgf_small): + out_writer = ms_io.MztabWriter("dummy.mztab") + # Set peak file by base file name only. + out_writer.set_ms_run([os.path.basename(mgf_small.name)]) + assert os.path.basename(mgf_small.name) not in out_writer._run_map + assert os.path.abspath(mgf_small.name) in out_writer._run_map + # Set peak file by full path. 
+ out_writer.set_ms_run([os.path.abspath(mgf_small.name)]) + assert os.path.basename(mgf_small.name) not in out_writer._run_map + assert os.path.abspath(mgf_small.name) in out_writer._run_map From b27db98686c40bfe3487ab30250a54fdba936dc9 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Mon, 17 Apr 2023 20:49:30 +0200 Subject: [PATCH 05/13] Only split off known extensions from output filename (#171) * Only split off extension if it's mzTab * Also check for .log extension * Update output format in help message --- casanovo/casanovo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index d3e42642..72b02057 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -67,7 +67,7 @@ @click.option( "--output", help="The base output file name to store logging (extension: .log) and " - "(optionally) prediction results (extension: .csv).", + "(optionally) prediction results (extension: .mztab).", type=click.Path(dir_okay=False), ) def main( @@ -96,7 +96,8 @@ def main( f"casanovo_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}", ) else: - output = os.path.splitext(os.path.abspath(output))[0] + basename, ext = os.path.splitext(os.path.abspath(output)) + output = basename if ext.lower() in (".log", ".mztab") else output # Configure logging. 
logging.captureWarnings(True) From 91342783c221a0b5530f971d9141132a9976123a Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Wed, 21 Jun 2023 08:30:59 +0200 Subject: [PATCH 06/13] Prepare for release v3.4.0 (#194) --- CHANGELOG.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c87c3d8..bc6340e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased] +## [3.4.0] - 2023-06-19 + ### Added - `every_n_train_steps` parameter now controls the frequency of both validation epochs and model checkpointing during training. @@ -17,7 +19,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Fixed -- Upgrade to Depthcharge v0.2.0 to fix sinusoidal encoding. - Correctly refer to input peak files by their full file path. ## [3.3.0] - 2023-04-04 @@ -173,7 +174,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Initial Casanovo version. 
-[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v3.3.0...HEAD +[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v3.4.0...HEAD +[3.4.0]: https://github.com/Noble-Lab/casanovo/compare/v3.3.0...v3.4.0 [3.3.0]: https://github.com/Noble-Lab/casanovo/compare/v3.2.0...v3.3.0 [3.2.0]: https://github.com/Noble-Lab/casanovo/compare/v3.1.0...v3.2.0 [3.1.0]: https://github.com/Noble-Lab/casanovo/compare/v3.0.0...v3.1.0 From 26b08cf4ba6f09bd64a2bbddb8c1975788cdbe4e Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Sat, 24 Jun 2023 13:01:18 +0200 Subject: [PATCH 07/13] Add FAQ about using Casanovo for new PTMs (#199) * Add FAQ about using Casanovo for new PTMs * Update based on Bill's feedback --- docs/faq.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/faq.md b/docs/faq.md index 269d09bf..6ffd7e11 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -66,3 +66,24 @@ Note that taking very frequent model snapshots will result in somewhat slower tr When saving a model snapshot, Casanovo will use the validation data to compute performance measures (training loss, validation loss, amino acid precision, and peptide precision) and print this information to the console and log file. After your training job is finished, you can identify the best performing model that achieves the maximum peptide and amino acid precision from the log file and use the corresponding model snapshot. + +**Even though I added new post-translational modifications to the configuration file, Casanovo didn't identify those peptides.** + +Casanovo can only make predictions using post-translational modifications (PTMs) that were included when training the model. +If you want to add new types of PTMs, then you will need to retrain the model. + +The [`config.yaml` configuration file](https://github.com/Noble-Lab/casanovo/blob/main/casanovo/config.yaml) contains all amino acids and PTMs that Casanovo knows. 
+By default, this includes oxidation of methionine, deamidation of asparagine and glutamine, N-terminal acetylation, N-terminal carbamylation, and an N-terminal loss of ammonia. +(Additionally, cysteines are _always_ considered to be carbamidomethylated.) +Simply making changes to the `residues` alphabet in the configuration file is insufficient to identify new types of PTMs with Casanovo, however. +This is indicated by the fact that this option is not marked with `(I)` in the configuration file, which indicates options that can be modified during inference. +All remaining options require training a new Casanovo model. + +Therefore, to learn the spectral signature of previously unknown PTMs, a new Casanovo version needs to be _trained_. +To include new PTMs in Casanovo, you need to: +1. Update the `residues` alphabet in the configuration file accordingly. +2. Compile a large training dataset that includes those PTMs and format this as an annotated MGF file. Note that you can include some or all of the data that was originally used to train Casanovo (see above), in addition to the data that includes your new types of PTMs. +3. Train a new version of Casanovo on this dataset. + +It is unfortunately not possible to finetune a pre-trained Casanovo model to add new types of PTMs. +Instead, such a model must be trained from scratch. 
From 4d3e2f9fd02e76842a03d012bb8283a49b7751a6 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Fri, 7 Jul 2023 17:29:43 +0200 Subject: [PATCH 08/13] =?UTF-8?q?Document=20how=20to=20create=20precision?= =?UTF-8?q?=E2=80=93coverage=20curve=20(#205)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/faq.md | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/docs/faq.md b/docs/faq.md index 6ffd7e11..ec46bdc3 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -87,3 +87,76 @@ To include new PTMs in Casanovo, you need to: It is unfortunately not possible to finetune a pre-trained Casanovo model to add new types of PTMs. Instead, such a model must be trained from scratch. + +**How can I generate a precision–coverage curve?** + +You can evaluate a trained Casanovo model compared to ground-truth peptide labels using a precision–coverage curve. + +1. Run Casanovo in sequencing or evaluation mode on your MS/MS data, [as described here](https://casanovo.readthedocs.io/en/latest/getting_started.html#running-casanovo). +2. Collect the ground-truth peptide labels as well as the peptide labels predicted by Casanovo. Note that Casanovo might not report a peptide for every spectrum if the spectra are invalid (e.g. not enough peaks), so make sure that both pieces of information are correctly linked to each other (using the `spectra_ref` column in the mzTab output file produced by Casanovo). +3. Use the following script to plot a precision–coverage curve: +```python +import depthcharge +import matplotlib.pyplot as plt +import numpy as np +from sklearn.metrics import auc + +from casanovo.denovo import evaluate + + +# `psm_sequences` is assumed to be a DataFrame with at least the following +# three columns: +# - "sequence": The ground-truth peptide labels. +# - "sequence_pred": The predicted peptide labels. +# - "search_engine_score[1]": The prediction scores. +psm_sequences = ... 
# TODO: Get the PSM information. + +# Sort the PSMs by decreasing prediction score. +psm_sequences = psm_sequences.sort_values( + "search_engine_score[1]", ascending=False +) +# Find matches between the true and predicted peptide sequences. +aa_matches_batch = evaluate.aa_match_batch( + psm_sequences["sequence"], + psm_sequences["sequence_pred"], + depthcharge.masses.PeptideMass("massivekb").masses, +) +# Calculate the peptide precision and coverage. +peptide_matches = np.asarray([aa_match[1] for aa_match in aa_matches_batch[0]]) +precision = np.cumsum(peptide_matches) / np.arange(1, len(peptide_matches) + 1) +coverage = np.arange(1, len(peptide_matches) + 1) / len(peptide_matches) +# Calculate the score threshold at which peptide predictions don't fit the +# precursor m/z tolerance anymore. +threshold = np.argmax(psm_sequences["search_engine_score[1]"] < 0) + +# Print the performance values. +print(f"Peptide precision = {precision[threshold]:.3f}") +print(f"Coverage = {coverage[threshold]:.3f}") +print(f"Peptide precision @ coverage=1 = {precision[-1]:.3f}") + +# Plot the precision–coverage curve. 
+width = 4 +height = width / 1.618 +fig, ax = plt.subplots(figsize=(width, width)) + +ax.plot( + coverage, precision, label=f"Casanovo AUC = {auc(coverage, precision):.3f}" +) +ax.scatter( + coverage[threshold], + precision[threshold], + s=50, + marker="D", + edgecolors="black", + zorder=10, +) +ax.set_xlim(0, 1) +ax.set_ylim(0, 1) + +ax.set_xlabel("Coverage") +ax.set_ylabel("Peptide precision") +ax.legend(loc="lower left") + +plt.savefig("prec_cov.png", dpi=300, bbox_inches="tight") +plt.close() +``` From fba790d0abe3f9fad1041ed4185ff90043af8c2f Mon Sep 17 00:00:00 2001 From: Melih Yilmaz <32707537+melihyilmaz@users.noreply.github.com> Date: Tue, 1 Aug 2023 09:43:45 -0700 Subject: [PATCH 09/13] Add non-enzymatic weights description to docs (#225) * Add non-enzymatic weights description to docs * Update weights information --------- Co-authored-by: Wout Bittremieux --- docs/getting_started.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/getting_started.md b/docs/getting_started.md index 2f385e2d..170cba66 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -72,6 +72,10 @@ To assist users, if no model file is specified Casanovo will try to download and Not all releases might have a model file included on the [Releases page](https://github.com/Noble-Lab/casanovo/releases), in which case model weights for alternative releases with the same major version number can be used. +The most recent model weights for Casanovo version 3.x are currently provided under [Casanovo v3.0.0](https://github.com/Noble-Lab/casanovo/releases/tag/v3.0.0): +- `casanovo_massivekb.ckpt`: Default Casanovo weights to use when analyzing tryptic data. These weights will be downloaded automatically if no weights are explicitly specified. +- `casanovo_non-enzy.checkpt`: Casanovo weights to use when analyzing non-tryptic data, obtained by fine-tuning the tryptic model on multi-enzyme data. These weights need to be downloaded manually. 
+ ## Running Casanovo ```{note} From 4aafa736260b88a1b519a359547c3de240ccd4ba Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Tue, 8 Aug 2023 18:33:29 +0200 Subject: [PATCH 10/13] Fix running evaluation (#218) * Don't assign output files during eval mode * Update changelog --- CHANGELOG.md | 4 ++++ casanovo/denovo/model_runner.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc6340e6..6076b631 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased] +### Fixed + +- Don't try to assign non-existing output writer during eval mode. + ## [3.4.0] - 2023-06-19 ### Added diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index fb5deeba..223c14c7 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -124,7 +124,7 @@ def _execute_existing( if len(peak_filenames := _get_peak_filenames(peak_path, peak_ext)) == 0: logger.error("Could not find peak files from %s", peak_path) raise FileNotFoundError("Could not find peak files") - else: + elif out_writer is not None: out_writer.set_ms_run(peak_filenames) peak_is_index = any( [os.path.splitext(fn)[1] in (".h5", ".hdf5") for fn in peak_filenames] From 712c278883c89a11006ce4cf873964a175f57b4e Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Wed, 16 Aug 2023 12:01:04 +0200 Subject: [PATCH 11/13] Fix custom residues in config (#229) * Fix specifying custom residues * Update changelog --- CHANGELOG.md | 1 + casanovo/config.py | 1 + tests/unit_tests/test_config.py | 16 +++++++++++++--- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6076b631..dd54b8e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Fixed - Correctly refer to input peak files by their full file path. 
+- Specifying custom residues to retrain Casanovo is now possible. ## [3.3.0] - 2023-04-04 diff --git a/casanovo/config.py b/casanovo/config.py index 4dc93c26..0dfdaf67 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -50,6 +50,7 @@ class Config: dropout=float, dim_intensity=int, max_length=int, + residues=dict, n_log=int, tb_summarywriter=str, warmup_iters=int, diff --git a/tests/unit_tests/test_config.py b/tests/unit_tests/test_config.py index 8282e367..89b37e42 100644 --- a/tests/unit_tests/test_config.py +++ b/tests/unit_tests/test_config.py @@ -1,6 +1,4 @@ """Test configuration loading""" -import pytest - from casanovo.config import Config @@ -17,11 +15,23 @@ def test_override(tmp_path): """Test overriding the default""" yml = tmp_path / "test.yml" with yml.open("w+") as f_out: - f_out.write("random_seed: 42\ntop_match: 3") + f_out.write( + """random_seed: 42 +top_match: 3 +residues: + W: 1 + O: 2 + U: 3 + T: 4 +""" + ) config = Config(yml) assert config.random_seed == 42 assert config["random_seed"] == 42 assert not config.no_gpu assert config.top_match == 3 + assert len(config.residues) == 4 + for i, residue in enumerate("WOUT", 1): + assert config["residues"][residue] == i assert config.file == str(yml) From ea2e8751824cfc31ecb40d2dd44a499905e8805c Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Wed, 16 Aug 2023 12:04:33 +0200 Subject: [PATCH 12/13] Update changelog --- CHANGELOG.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd54b8e3..a3258e1c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,9 +6,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased] +## [3.5.0] - 2023-08-16 + ### Fixed - Don't try to assign non-existing output writer during eval mode. +- Specifying custom residues to retrain Casanovo is now possible. 
## [3.4.0] - 2023-06-19 @@ -24,7 +27,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Fixed - Correctly refer to input peak files by their full file path. -- Specifying custom residues to retrain Casanovo is now possible. ## [3.3.0] - 2023-04-04 @@ -179,7 +181,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Initial Casanovo version. -[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v3.4.0...HEAD +[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v3.5.0...HEAD +[3.5.0]: https://github.com/Noble-Lab/casanovo/compare/v3.4.0...v3.5.0 [3.4.0]: https://github.com/Noble-Lab/casanovo/compare/v3.3.0...v3.4.0 [3.3.0]: https://github.com/Noble-Lab/casanovo/compare/v3.2.0...v3.3.0 [3.2.0]: https://github.com/Noble-Lab/casanovo/compare/v3.1.0...v3.2.0 From 8a28ccde6098face1b6e695bc31b9cda660da4ba Mon Sep 17 00:00:00 2001 From: Isha Gokhale Date: Tue, 10 Oct 2023 13:32:00 -0700 Subject: [PATCH 13/13] Removed custom_encoder option --- casanovo/config.yaml | 3 --- casanovo/denovo/model_runner.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 0e5c6b95..0a85adba 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -45,9 +45,6 @@ dropout: 0.0 # Number of dimensions to use for encoding peak intensity # Projected up to ``dim_model`` by default and summed with the peak m/z encoding dim_intensity: -# Option to provide a pre-trained spectrum encoder when training -# Trained from scratch by default -custom_encoder: # Max decoded peptide length max_length: 100 # Amino acid and modification vocabulary to use diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 223c14c7..d153ed9f 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -104,7 +104,6 @@ def _execute_existing( n_layers=config["n_layers"], dropout=config["dropout"], dim_intensity=config["dim_intensity"], 
- custom_encoder=config["custom_encoder"], max_length=config["max_length"], residues=config["residues"], max_charge=config["max_charge"], @@ -260,7 +259,6 @@ def train( n_layers=config["n_layers"], dropout=config["dropout"], dim_intensity=config["dim_intensity"], - custom_encoder=config["custom_encoder"], max_length=config["max_length"], residues=config["residues"], max_charge=config["max_charge"],