Log optimizer and training metrics to CSV file (#376)
* csv logger

* optimizer metrics logger

* metrics logging unit tests

* config item retrieval, additional requested changes

* Generate new screengrabs with rich-codex

* changelog update

* Generate new screengrabs with rich-codex

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Lilferrit and github-actions[bot] authored Sep 20, 2024
1 parent aefa73c commit 34c456d
Showing 7 changed files with 85 additions and 3 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -10,12 +10,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

- During training, model checkpoints will be saved at the end of each training epoch in addition to the checkpoints saved at the end of every validation run.
- In addition to a local file, model weights can be specified via a URL. Upon initial download, the weights file is cached for future re-use.
- Training and optimizer metrics can now be logged to a CSV file by setting the `log_metrics` config option to true; the CSV file will be written to a sub-directory of the output directory named `csv_logs` (see the loading example after this diff).

### Changed

- Removed the `evaluate` sub-command; all model evaluation functionality has been moved to the `sequence` command via the new `--evaluate` flag.
- The `--output` option has been split into two options, `--output_dir` and `--output_root`.
- The `--validation_peak_path` option is now optional when training; if it is not set, the `train_peak_path` will also be used for validation.
- The `tb_summarywriter` config option is now a boolean; if set to true, the TensorBoard summary will be written to a sub-directory of the output directory named `tensorboard`.

### Fixed

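A minimal sketch of loading the resulting metrics file, assuming Lightning's `CSVLogger` default file name (`metrics.csv`) and a hypothetical output directory; the available columns depend on which metrics the model logs:

```python
from pathlib import Path

import pandas as pd

# Hypothetical output directory; with `log_metrics: true`, the logs land
# in <output_dir>/csv_logs/metrics.csv (Lightning's default file name).
output_dir = Path("casanovo_output")
metrics = pd.read_csv(output_dir / "csv_logs" / "metrics.csv")
print(metrics.columns.tolist())  # e.g. "step", "epoch", loss and lr-* columns
```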
2 changes: 1 addition & 1 deletion casanovo/casanovo.py
@@ -192,7 +192,7 @@ def sequence(

runner.predict(
peak_path,
-str((output_path / output_root).with_suffix(".mztab")),
+str((output_path / output_root_name).with_suffix(".mztab")),
evaluate=evaluate,
)
psms = runner.writer.psms
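The fix above renames the variable to `output_root_name`; for reference, a small illustration of how the mzTab path is assembled, with hypothetical values standing in for the CLI-derived ones:

```python
from pathlib import Path

# Hypothetical stand-ins for the CLI-derived values in casanovo.py.
output_path = Path("results")
output_root_name = "casanovo_run"

# with_suffix() replaces any existing suffix on the final path component,
# yielding "results/casanovo_run.mztab".
mztab_file = str((output_path / output_root_name).with_suffix(".mztab"))
```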
2 changes: 2 additions & 0 deletions casanovo/config.py
@@ -65,6 +65,8 @@ class Config:
residues=dict,
n_log=int,
tb_summarywriter=bool,
log_metrics=bool,
log_every_n_steps=int,
train_label_smoothing=float,
warmup_iters=int,
cosine_schedule_period_iters=int,
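The two new entries extend the mapping that `Config` uses to type-check user-supplied options. A rough sketch of how such a mapping can drive validation, in a simplified stand-alone form (the actual `Config` class may differ):

```python
# Sketch only: coerce and validate user config values against a
# name-to-type mapping like the one above.
_config_types = {
    "n_log": int,
    "tb_summarywriter": bool,
    "log_metrics": bool,
    "log_every_n_steps": int,
}

def validate_config(user_config: dict) -> dict:
    validated = {}
    for key, value in user_config.items():
        if key not in _config_types:
            raise KeyError(f"unknown config option: {key}")
        validated[key] = _config_types[key](value)
    return validated
```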
4 changes: 4 additions & 0 deletions casanovo/config.yaml
@@ -42,6 +42,10 @@ random_seed: 454
n_log: 1
# Whether to create tensorboard directory
tb_summarywriter: false
# Whether to create csv_logs directory
log_metrics: false
# How often to log optimizer parameters in steps
log_every_n_steps: 50
# Model validation and checkpointing frequency in training steps.
val_check_interval: 50_000

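These defaults back the `config.get(...)` calls used in the model runner below; a brief usage sketch, with a hypothetical config file path:

```python
from casanovo.config import Config

config = Config("config.yaml")  # hypothetical path to a user config file
log_metrics = config.get("log_metrics")              # false unless overridden
log_every_n_steps = config.get("log_every_n_steps")  # 50 unless overridden
```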
35 changes: 34 additions & 1 deletion casanovo/denovo/model_runner.py
@@ -13,11 +13,12 @@

import depthcharge.masses
import lightning.pytorch as pl
import lightning.pytorch.loggers
import numpy as np
import torch
from depthcharge.data import AnnotatedSpectrumIndex, SpectrumIndex
from lightning.pytorch.strategies import DDPStrategy
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor

from .. import utils
from ..config import Config
@@ -63,6 +64,8 @@ def __init__(
self.config = config
self.model_filename = model_filename
self.output_dir = output_dir
self.output_rootname = output_rootname
self.overwrite_ckpt_check = overwrite_ckpt_check

# Initialized later:
self.tmp_dir = None
@@ -105,6 +108,7 @@ def __init__(
filename=best_filename,
enable_version_counter=False,
),
LearningRateMonitor(log_momentum=True, log_weight_decay=True),
]

def __enter__(self):
@@ -255,7 +259,36 @@ def initialize_trainer(self, train: bool) -> None:
strategy=self._get_strategy(),
val_check_interval=self.config.val_check_interval,
check_val_every_n_epoch=None,
log_every_n_steps=self.config.get("log_every_n_steps"),
)

if self.config.get("log_metrics"):
if not self.output_dir:
logger.warning(
"Output directory not set in model runner. "
"No loss file will be created."
)
else:
csv_log_dir = "csv_logs"
if self.overwrite_ckpt_check:
utils.check_dir_file_exists(
self.output_dir,
csv_log_dir,
)

additional_cfg.update(
{
"logger": lightning.pytorch.loggers.CSVLogger(
self.output_dir,
version=csv_log_dir,
name=None,
),
"log_every_n_steps": self.config.get(
"log_every_n_steps"
),
}
)

trainer_cfg.update(additional_cfg)

self.trainer = pl.Trainer(**trainer_cfg)
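Pulled out of the diff, the new logging wiring reduces to the following stand-alone pattern; the output directory and the commented-out `fit` call are placeholders, not part of the commit:

```python
import lightning.pytorch as pl
from lightning.pytorch.callbacks import LearningRateMonitor
from lightning.pytorch.loggers import CSVLogger

output_dir = "casanovo_output"  # placeholder; derived from --output_dir in Casanovo

trainer = pl.Trainer(
    # Log learning rate, momentum, and weight decay alongside the
    # training/validation metrics.
    callbacks=[LearningRateMonitor(log_momentum=True, log_weight_decay=True)],
    # name=None with version="csv_logs" writes the logs directly to
    # <output_dir>/csv_logs instead of a versioned sub-directory.
    logger=CSVLogger(output_dir, name=None, version="csv_logs"),
    log_every_n_steps=50,
)
# trainer.fit(model, train_dataloader, val_dataloader)
```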
4 changes: 3 additions & 1 deletion tests/conftest.py
@@ -224,7 +224,9 @@ def tiny_config(tmp_path):
"devices": None,
"random_seed": 454,
"n_log": 1,
"tb_summarywriter": None,
"tb_summarywriter": False,
"log_metrics": False,
"log_every_n_steps": 50,
"n_peaks": 150,
"min_mz": 50.0,
"max_mz": 2500.0,
39 changes: 39 additions & 0 deletions tests/unit_tests/test_runner.py
@@ -1,5 +1,6 @@
"""Unit tests specifically for the model_runner module."""

import shutil
from pathlib import Path

import pytest
@@ -282,3 +283,41 @@ def test_evaluate(
)

result_file.unlink()


def test_metrics_logging(tmp_path, mgf_small, tiny_config):
config = Config(tiny_config)
config._user_config["log_metrics"] = True
config._user_config["log_every_n_steps"] = 1
config.tb_summarywriter = True
config.max_epochs = 1

curr_model_path = tmp_path / "foo.epoch=0-step=1.ckpt"
best_model_path = tmp_path / "foo.best.ckpt"
tb_path = tmp_path / "tensorboard"
csv_path = tmp_path / "csv_logs"

with ModelRunner(
config, output_dir=tmp_path, output_rootname="foo"
) as runner:
runner.train([mgf_small], [mgf_small])

assert curr_model_path.is_file()
assert best_model_path.is_file()
assert tb_path.is_dir()
assert csv_path.is_dir()

curr_model_path.unlink()
best_model_path.unlink()
shutil.rmtree(tb_path)

with pytest.raises(FileExistsError):
with ModelRunner(
config, output_dir=tmp_path, output_rootname="foo"
) as runner:
runner.train([mgf_small], [mgf_small])

assert not curr_model_path.is_file()
assert not best_model_path.is_file()
assert not tb_path.is_dir()
assert csv_path.is_dir()
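
The `FileExistsError` asserted in the second context comes from the `utils.check_dir_file_exists` guard called in `initialize_trainer`. Its behavior, inferred from this test rather than from the actual implementation, is roughly:

```python
from pathlib import Path

# Assumption inferred from the test above, not the real implementation:
# refuse to overwrite an existing output artifact such as csv_logs.
def check_dir_file_exists(output_dir: Path, file_name: str) -> None:
    if (Path(output_dir) / file_name).exists():
        raise FileExistsError(f"{file_name} already exists in {output_dir}")
```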
