Log optimizer and training metrics to CSV file #376

Merged · 8 commits · Sep 20, 2024
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -10,12 +10,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

- During training, model checkpoints will be saved at the end of each training epoch in addition to the checkpoints saved at the end of every validation run.
- In addition to a local file, model weights can now be specified via a URL. Upon initial download, the weights file is cached for future re-use.
- Training and optimizer metrics can now be logged to a CSV file by setting the `log_metrics` config option to true. The CSV file is written to a sub-directory of the output directory named `csv_logs`.

### Changed

- Removed the `evaluate` sub-command; all model evaluation functionality has been moved to the `sequence` command via the new `--evaluate` flag.
- The `--output` option has been split into two options, `--output_dir` and `--output_root`.
- The `--validation_peak_path` option is now optional when training; if it is not set, `train_peak_path` will also be used for validation.
- The `tb_summarywriter` config option is now a boolean; if set to true, the TensorBoard summary will be written to a sub-directory of the output directory named `tensorboard`.

### Fixed

2 changes: 1 addition & 1 deletion casanovo/casanovo.py
@@ -192,7 +192,7 @@ def sequence(

    runner.predict(
        peak_path,
-        str((output_path / output_root).with_suffix(".mztab")),
+        str((output_path / output_root_name).with_suffix(".mztab")),
        evaluate=evaluate,
    )
    psms = runner.writer.psms
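The renamed variable still yields the same output-path shape. A small standalone illustration of the `pathlib` pattern used here, with placeholder values:

```python
from pathlib import Path

# "results" and "run1" are placeholders standing in for output_path
# and output_root_name.
output_path, output_root_name = Path("results"), "run1"

# with_suffix() appends the extension to the joined path.
print(str((output_path / output_root_name).with_suffix(".mztab")))
# -> results/run1.mztab
```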
2 changes: 2 additions & 0 deletions casanovo/config.py
@@ -65,6 +65,8 @@ class Config:
        residues=dict,
        n_log=int,
        tb_summarywriter=bool,
        log_metrics=bool,
        log_every_n_steps=int,
        train_label_smoothing=float,
        warmup_iters=int,
        cosine_schedule_period_iters=int,
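The new options slot into the class's existing type map. As a rough sketch of how such a map can drive config validation (the `validate` helper below is hypothetical, not Casanovo's actual code):

```python
# Illustrative only: a minimal type-map check in the spirit of the
# Config class above.
config_types = {
    "tb_summarywriter": bool,
    "log_metrics": bool,
    "log_every_n_steps": int,
}

def validate(user_cfg: dict) -> None:
    # Reject values whose type disagrees with the registered type.
    for key, expected in config_types.items():
        value = user_cfg.get(key)
        if value is not None and not isinstance(value, expected):
            raise TypeError(
                f"{key} should be {expected.__name__}, "
                f"got {type(value).__name__}"
            )

validate({"log_metrics": True, "log_every_n_steps": 50})  # passes silently
```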
4 changes: 4 additions & 0 deletions casanovo/config.yaml
@@ -42,6 +42,10 @@ random_seed: 454
n_log: 1
# Whether to create tensorboard directory
tb_summarywriter: false
# Whether to create csv_logs directory
log_metrics: false
# How often to log optimizer parameters in steps
log_every_n_steps: 50
# Model validation and checkpointing frequency in training steps.
val_check_interval: 50_000

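To show how these keys parse, a minimal sketch using PyYAML (assumed installed); the key names match the config snippet above:

```python
import yaml  # PyYAML

# The new options as a user would write them, alongside the existing
# tb_summarywriter flag shown above.
snippet = """
tb_summarywriter: false
log_metrics: false
log_every_n_steps: 50
"""

cfg = yaml.safe_load(snippet)
assert cfg["log_metrics"] is False      # YAML "false" parses to a Python bool
assert cfg["log_every_n_steps"] == 50   # plain integers parse to int
```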
35 changes: 34 additions & 1 deletion casanovo/denovo/model_runner.py
@@ -13,11 +13,12 @@

import depthcharge.masses
import lightning.pytorch as pl
import lightning.pytorch.loggers
import numpy as np
import torch
from depthcharge.data import AnnotatedSpectrumIndex, SpectrumIndex
from lightning.pytorch.strategies import DDPStrategy
-from lightning.pytorch.callbacks import ModelCheckpoint
+from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor

from .. import utils
from ..config import Config
@@ -63,6 +64,8 @@ def __init__(
        self.config = config
        self.model_filename = model_filename
        self.output_dir = output_dir
        self.output_rootname = output_rootname
        self.overwrite_ckpt_check = overwrite_ckpt_check

        # Initialized later:
        self.tmp_dir = None
@@ -105,6 +108,7 @@ def __init__(
                filename=best_filename,
                enable_version_counter=False,
            ),
            LearningRateMonitor(log_momentum=True, log_weight_decay=True),
        ]

    def __enter__(self):
@@ -255,7 +259,36 @@ def initialize_trainer(self, train: bool) -> None:
                strategy=self._get_strategy(),
                val_check_interval=self.config.val_check_interval,
                check_val_every_n_epoch=None,
                log_every_n_steps=self.config.get("log_every_n_steps"),
            )

            if self.config.get("log_metrics"):
                if not self.output_dir:
                    logger.warning(
                        "Output directory not set in model runner. "
                        "No loss file will be created."
                    )
                else:
                    csv_log_dir = "csv_logs"
                    if self.overwrite_ckpt_check:
                        utils.check_dir_file_exists(
                            self.output_dir,
                            csv_log_dir,
                        )

                    additional_cfg.update(
                        {
                            "logger": lightning.pytorch.loggers.CSVLogger(
                                self.output_dir,
                                version=csv_log_dir,
                                name=None,
                            ),
                            "log_every_n_steps": self.config.get(
                                "log_every_n_steps"
                            ),
                        }
                    )

            trainer_cfg.update(additional_cfg)

        self.trainer = pl.Trainer(**trainer_cfg)
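In isolation, the logging pieces above can be reproduced with a few lines of Lightning. A minimal sketch assuming Lightning >= 2.1 (needed for `log_weight_decay`) and a placeholder output directory:

```python
import lightning.pytorch as pl
from lightning.pytorch.callbacks import LearningRateMonitor
from lightning.pytorch.loggers import CSVLogger

# name=None plus a fixed version writes metrics straight to
# <save_dir>/csv_logs/, mirroring the runner code above.
csv_logger = CSVLogger("out", name=None, version="csv_logs")

# Logs the learning rate and, with these flags, optimizer momentum
# and weight decay as well.
lr_monitor = LearningRateMonitor(log_momentum=True, log_weight_decay=True)

trainer = pl.Trainer(
    logger=csv_logger,
    callbacks=[lr_monitor],
    log_every_n_steps=50,  # matches the new config option's default
    max_epochs=1,
)
# trainer.fit(model, datamodule)  # metrics land in out/csv_logs/metrics.csv
```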
4 changes: 3 additions & 1 deletion tests/conftest.py
@@ -224,7 +224,9 @@ def tiny_config(tmp_path):
"devices": None,
"random_seed": 454,
"n_log": 1,
"tb_summarywriter": None,
"tb_summarywriter": False,
"log_metrics": False,
"log_every_n_steps": 50,
"n_peaks": 150,
"min_mz": 50.0,
"max_mz": 2500.0,
39 changes: 39 additions & 0 deletions tests/unit_tests/test_runner.py
@@ -1,5 +1,6 @@
"""Unit tests specifically for the model_runner module."""

import shutil
from pathlib import Path

import pytest
@@ -282,3 +283,41 @@ def test_evaluate(
    )

    result_file.unlink()


def test_metrics_logging(tmp_path, mgf_small, tiny_config):
    config = Config(tiny_config)
    config._user_config["log_metrics"] = True
    config._user_config["log_every_n_steps"] = 1
    config.tb_summarywriter = True
    config.max_epochs = 1

    curr_model_path = tmp_path / "foo.epoch=0-step=1.ckpt"
    best_model_path = tmp_path / "foo.best.ckpt"
    tb_path = tmp_path / "tensorboard"
    csv_path = tmp_path / "csv_logs"

    with ModelRunner(
        config, output_dir=tmp_path, output_rootname="foo"
    ) as runner:
        runner.train([mgf_small], [mgf_small])

    assert curr_model_path.is_file()
    assert best_model_path.is_file()
    assert tb_path.is_dir()
    assert csv_path.is_dir()

    curr_model_path.unlink()
    best_model_path.unlink()
    shutil.rmtree(tb_path)

    with pytest.raises(FileExistsError):
        with ModelRunner(
            config, output_dir=tmp_path, output_rootname="foo"
        ) as runner:
            runner.train([mgf_small], [mgf_small])

    assert not curr_model_path.is_file()
    assert not best_model_path.is_file()
    assert not tb_path.is_dir()
    assert csv_path.is_dir()
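After a run like this test, the logged metrics can be inspected with the standard library. A hedged sketch: the path follows the runner code above, and the column names beyond `step` are hypothetical, since they depend on which metrics the model and `LearningRateMonitor` actually log:

```python
import csv
from pathlib import Path

# Directory layout mirrors the test above: <output_dir>/csv_logs/metrics.csv.
metrics_file = Path("csv_logs") / "metrics.csv"

with metrics_file.open(newline="") as f:
    for row in csv.DictReader(f):
        # Print the step plus whatever metric columns were logged.
        print(row.get("step"), {k: v for k, v in row.items() if k != "step"})
```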