From c5ea0aa49e04f88608cc7d76ee6dcbb8befdf882 Mon Sep 17 00:00:00 2001 From: Anna Lappe <153988542+annaelisalappe@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:51:24 +0200 Subject: [PATCH] Added config option to hpo script, styling (#235) --- use-cases/eurac/hpo.py | 1 - use-cases/eurac/slurm_ray.sh | 4 ++-- use-cases/eurac/trainer.py | 7 +++++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/use-cases/eurac/hpo.py b/use-cases/eurac/hpo.py index 0eaba9f0..46d5e75e 100644 --- a/use-cases/eurac/hpo.py +++ b/use-cases/eurac/hpo.py @@ -1,6 +1,5 @@ import argparse import os -from pathlib import Path from typing import Dict import matplotlib.pyplot as plt diff --git a/use-cases/eurac/slurm_ray.sh b/use-cases/eurac/slurm_ray.sh index 7fa34869..e26632c7 100644 --- a/use-cases/eurac/slurm_ray.sh +++ b/use-cases/eurac/slurm_ray.sh @@ -3,7 +3,7 @@ # Job configuration #SBATCH --job-name=ray_tune_hpo #SBATCH --account=intertwin -#SBATCH --time 01:00:00 +#SBATCH --time 02:30:00 # Resources allocation #SBATCH --cpus-per-task=24 @@ -88,7 +88,7 @@ echo All Ray workers started. # Run the Python script using Ray echo 'Starting HPO.' -python hpo.py --num_samples 8 --max_iterations 2 --ngpus $num_gpus --ncpus $num_cpus +python hpo.py --num_samples 4 --max_iterations 2 --ngpus $num_gpus --ncpus $num_cpus --pipeline_name rnn_training_pipeline # NOTE: conv_training_pipeline has not been tested # Shutdown Ray after completion ray stop \ No newline at end of file diff --git a/use-cases/eurac/trainer.py b/use-cases/eurac/trainer.py index 88ac42f5..628ac66d 100644 --- a/use-cases/eurac/trainer.py +++ b/use-cases/eurac/trainer.py @@ -213,6 +213,7 @@ def train(self): self.lr_scheduler.step(avg_val_loss) loss_history["train"].append(train_loss) loss_history["val"].append(avg_val_loss) + self.log( item=train_loss.item(), identifier="train_loss_per_epoch", @@ -496,6 +497,12 @@ def train(self): best_loss = avg_val_loss # self.model.load_state_dict(best_model_weights) + # Report training metrics of last epoch to Ray + train.report( + {"loss": avg_val_loss.item(), + "train_loss": train_loss.item()} + ) + return loss_history, metric_history def create_dataloaders(self, train_dataset, validation_dataset, test_dataset):