Merge pull request #74 from LucasAlegre/feature/hpo
Feature/hpo

Showing 13 changed files with 481 additions and 67 deletions.
# Hyperparameter optimization

MORL-Baselines contains an early solution to the problem of hyperparameter optimization for MORL. The problem and solution are introduced and discussed in the following paper:
[F. Felten, D. Gareev, E.-G. Talbi, and G. Danoy, “Hyperparameter Optimization for Multi-Objective Reinforcement Learning.” arXiv, Oct. 25, 2023. doi: 10.48550/arXiv.2310.16487.](https://arxiv.org/abs/2310.16487)

A script to launch the hyperparameter sweep is available in [`experiments/hyperparameter_search/launch_sweep.py`](https://github.com/LucasAlegre/morl-baselines/experiments/hyperparameter_search/launch_sweep.py).

An example usage of the script:

```bash
python experiments/hyperparameter_search/launch_sweep.py \
    --algo envelope \
    --env-id minecart-v0 \
    --ref-point 0 0 -200 \
    --sweep-count 100 \
    --seed 10 \
    --num-seeds 3 \
    --config-name envelope \
    --train-hyperparams num_eval_weights_for_front:100 reset_num_timesteps:False eval_freq:10000 total_timesteps:10000
```

This launches a hyperparameter search for Envelope Q-Learning on minecart, using `[0, 0, -200]` as the reference point for the hypervolume computation. It tries 100 sets of hyperparameter values. The parameter distributions are specified in a YAML file given by `config-name` (by default the same name as the algorithm), which has to be in the sweep configuration directory. Each set of hyperparameter values is trained on 3 different seeds, starting from 10 (i.e. seeds 10, 11, 12).
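The reference point passed via `--ref-point` anchors the hypervolume indicator that the sweep maximizes (`avg_hypervolume` in the configs below). As a rough illustration of the metric only (not the implementation used by MORL-Baselines, which handles three objectives for minecart), a two-objective hypervolume for a maximization front can be sketched as:

```python
def hypervolume_2d(front, ref_point):
    """Hypervolume of a 2-objective *maximization* front w.r.t. a reference point.

    Assumes every point on the front dominates the reference point. Dominated
    points contribute nothing because only the non-dominated staircase is kept.
    """
    # Sort by the first objective, descending; keep the non-dominated staircase.
    pts = sorted(front, key=lambda p: p[0], reverse=True)
    staircase, best_second = [], float("-inf")
    for p in pts:
        if p[1] > best_second:  # strictly improves the second objective
            staircase.append(p)
            best_second = p[1]
    # Sum the rectangular slices between consecutive staircase points.
    hv = 0.0
    for i, (f0, f1) in enumerate(staircase):
        next_f0 = staircase[i + 1][0] if i + 1 < len(staircase) else ref_point[0]
        hv += (f0 - next_f0) * (f1 - ref_point[1])
    return hv

hypervolume_2d([(3.0, 1.0), (1.0, 3.0)], (0.0, 0.0))  # → 5.0
```

A larger hypervolume means the front pushes further past the reference point in all objectives, which is why it serves as a single scalar objective for the sweep.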
```yaml
method: bayes
metric:
  goal: maximize
  name: avg_hypervolume
parameters:
  learning_rate:
    distribution: uniform
    min: 0.0001
    max: 0.001
  initial_epsilon:
    distribution: uniform
    min: 0.01
    max: 1
  final_epsilon:
    distribution: uniform
    min: 0.01
    max: 1
  epsilon_decay_steps:
    distribution: int_uniform
    min: 1
    max: 100000
  tau:
    distribution: uniform
    min: 0.0
    max: 1.0
  target_net_update_freq:
    distribution: int_uniform
    min: 1
    max: 10000
  buffer_size:
    distribution: int_uniform
    min: 1000
    max: 2000000
  net_arch:
    value: [256, 256, 256, 256]
  batch_size:
    value: 32
  learning_starts:
    distribution: int_uniform
    min: 1
    max: 1000
  gradient_updates:
    distribution: int_uniform
    min: 1
    max: 10
  gamma:
    value: 0.98
  max_grad_norm:
    distribution: uniform
    min: 0.1
    max: 10.0
  num_sample_w:
    distribution: int_uniform
    min: 2
    max: 10
  per_alpha:
    distribution: uniform
    min: 0.1
    max: 0.9
  initial_homotopy_lambda:
    distribution: uniform
    min: 0.0
    max: 1
  final_homotopy_lambda:
    distribution: uniform
    min: 0.0
    max: 1
  homotopy_decay_steps:
    distribution: int_uniform
    min: 1
    max: 100000
```
```yaml
method: bayes
metric:
  goal: maximize
  name: avg_hypervolume
parameters:
  num_envs:
    distribution: int_uniform
    min: 2
    max: 8
  pop_size:
    # distribution: int_uniform
    # min: 4
    # max: 10
    # Fix the value for now as delta weight = 1 / (popsize-1)
    value: 6
  warmup_iterations:
    distribution: int_uniform
    min: 50
    max: 100
  steps_per_iteration:
    distribution: int_uniform
    min: 1000
    max: 5000
  evolutionary_iterations:
    distribution: int_uniform
    min: 10
    max: 30
  num_weight_candidates:
    distribution: int_uniform
    min: 5
    max: 10
  num_performance_buffer:
    distribution: int_uniform
    min: 50
    max: 200
  performance_buffer_size:
    distribution: int_uniform
    min: 1
    max: 5
  min_weight:
    value: 0.0
  max_weight:
    value: 1.0
  delta_weight:
    # distribution: uniform
    # min: 0.1
    # max: 0.5
    # Fix the value for now as delta weight = 1 / (popsize-1)
    value: 0.2
  gamma:
    value: 0.995
  num_minibatches:
    distribution: categorical
    values: [16, 32, 64]
  update_epochs:
    distribution: int_uniform
    min: 5
    max: 20
  learning_rate:
    distribution: uniform
    min: 0.0001
    max: 0.01
  anneal_lr:
    distribution: categorical
    values: [true, false]
  clip_coef:
    distribution: uniform
    min: 0.1
    max: 1.0
  ent_coef:
    distribution: uniform
    min: 0.0
    max: 0.01
  vf_coef:
    distribution: uniform
    min: 0.1
    max: 1.0
  clip_vloss:
    distribution: categorical
    values: [true, false]
  max_grad_norm:
    distribution: uniform
    min: 0.1
    max: 1.0
  norm_adv:
    distribution: categorical
    values: [true, false]
  gae:
    distribution: categorical
    values: [true, false]
  gae_lambda:
    distribution: uniform
    min: 0.9
    max: 0.99
```
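Both configs appear to follow the Weights & Biases sweep schema (`method: bayes`, a `metric` to maximize, and per-parameter `distribution` entries). To illustrate what the distribution kinds mean, here is a hypothetical random-search sampler over such a spec (a sketch only; the actual sweep uses W&B's Bayesian optimizer, not independent random draws):

```python
import random

def sample_config(parameters, rng=random):
    """Draw one hyperparameter assignment from a sweep 'parameters' spec.

    Supports the distribution kinds used in the configs above:
    fixed `value`, `uniform`, `int_uniform`, and `categorical`.
    """
    sample = {}
    for name, spec in parameters.items():
        if "value" in spec:
            sample[name] = spec["value"]
        elif spec["distribution"] == "uniform":
            sample[name] = rng.uniform(spec["min"], spec["max"])
        elif spec["distribution"] == "int_uniform":
            sample[name] = rng.randint(spec["min"], spec["max"])
        elif spec["distribution"] == "categorical":
            sample[name] = rng.choice(spec["values"])
        else:
            raise ValueError(f"unsupported distribution: {spec['distribution']}")
    return sample

# A small excerpt of the PGMORL-style spec above, as Python dicts.
spec = {
    "gamma": {"value": 0.995},
    "learning_rate": {"distribution": "uniform", "min": 0.0001, "max": 0.01},
    "num_minibatches": {"distribution": "categorical", "values": [16, 32, 64]},
    "update_epochs": {"distribution": "int_uniform", "min": 5, "max": 20},
}
config = sample_config(spec)
```

Fixed `value` entries (like `gamma` or `pop_size`) are excluded from the search space, which is how the configs pin parameters that are coupled to others (e.g. `delta_weight` to `pop_size`).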