
Add SRSAC #121

Merged: 1 commit, merged Apr 8, 2024
4 changes: 4 additions & 0 deletions .github/workflows/build.yml
@@ -142,6 +142,10 @@ jobs:
     - name: HyAR reproductions test
       run: |
         ./bin/test_reproductions --gpu_id -1 --base_env hybrid_env --env FakeHybridNNablaRL-v1
+    - name: DeepMind control suite reproductions test
+      run: |
+        pip install dm_control
+        ./bin/test_reproductions --gpu_id -1 --base_env dm_control --env FakeDMControlNNablaRL-v1
   copyright:
     runs-on: ubuntu-latest
     timeout-minutes: 3
23 changes: 22 additions & 1 deletion bin/evaluate_algorithm
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2021,2022,2023 Sony Group Corporation.
+# Copyright 2021,2022,2023,2024 Sony Group Corporation.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -110,6 +110,24 @@ HYBRID_ENV_LIST=(
"Platform-v0"
)

DM_CONTROL_LIST=(
"acrobot-swingup"
"cheetah-run"
"finger-turn_hard"
"fish-swim"
"hopper-hop"
"hopper-stand"
"humanoid-run"
"humanoid-stand"
"humanoid-walk"
"pendulum-swingup"
"quadruped-run"
"quadruped-walk"
"reacher-hard"
"swimmer-swimmer6"
"walker-run"
)

GPU_ID=0
ALGO_NAME="dqn"
BASE_ENV_NAME="atari"
@@ -193,6 +211,9 @@ do
if [ $BASE_ENV_NAME = "hybrid_env" ]; then
ENV_NAME=${HYBRID_ENV_LIST[$INDEX]}
fi
if [ $BASE_ENV_NAME = "dm_control" ]; then
ENV_NAME=${DM_CONTROL_LIST[$INDEX]}
fi
echo "Start running training for: " ${ENV_NAME}
if [ -n "$BATCH_SIZE" ]; then
${ROOT_DIR}/bin/train_with_seeds "${REPRODUCTION_CODE_DIR}/${ALGO_NAME}_reproduction.py" $GPU_ID $ENV_NAME $SAVE_DIR $NUM_SEEDS $BATCH_SIZE &
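Note for readers: the new DM_CONTROL_LIST entries use dm_control's "domain-task" naming. Below is a minimal sketch (not part of this diff) of how such an entry maps onto dm_control's suite loader, assuming dm_control is installed as in the CI step above:

```python
# Minimal sketch: resolve a "domain-task" entry from DM_CONTROL_LIST with the
# dm_control suite loader. maxsplit=1 keeps task names like "turn_hard" intact.
from dm_control import suite

domain, task = "finger-turn_hard".split("-", 1)
env = suite.load(domain_name=domain, task_name=task)
timestep = env.reset()
print(list(timestep.observation.keys()))
```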
22 changes: 21 additions & 1 deletion docs/source/nnablarl_api/algorithms.rst
@@ -331,14 +331,34 @@ SAC (ICML 2018 version)

 SAC-D
 ====
-.. autoclass:: nnabla_rl.algorithms.sac.SACDConfig
+.. autoclass:: nnabla_rl.algorithms.sacd.SACDConfig
     :members:
     :show-inheritance:
 
 .. autoclass:: nnabla_rl.algorithms.sacd.SACD
     :members:
     :show-inheritance:
 
+SRSAC
+====
+.. autoclass:: nnabla_rl.algorithms.srsac.SRSACConfig
+    :members:
+    :show-inheritance:
+
+.. autoclass:: nnabla_rl.algorithms.srsac.SRSAC
+    :members:
+    :show-inheritance:
+
+SRSAC(Computationally efficient ver.)
+====
+.. autoclass:: nnabla_rl.algorithms.srsac.EfficientSRSACConfig
+    :members:
+    :show-inheritance:
+
+.. autoclass:: nnabla_rl.algorithms.srsac.EfficientSRSAC
+    :members:
+    :show-inheritance:
+
 TD3
 ====
 .. autoclass:: nnabla_rl.algorithms.td3.TD3Config
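A hedged usage sketch for the newly documented classes (not from this PR): it assumes SRSAC follows the same env_or_env_info constructor pattern as SAC, shown later in this diff, and that SRSACConfig inherits the common gpu_id field.

```python
# Hedged sketch: instantiate the newly documented SRSAC algorithm.
# The constructor shape (env, config=...) is assumed to mirror SAC's.
import gym

import nnabla_rl.algorithms as A

env = gym.make("Pendulum-v1")
config = A.SRSACConfig(gpu_id=-1)  # -1 selects CPU
srsac = A.SRSAC(env, config=config)
```

EfficientSRSAC would presumably be constructed the same way with EfficientSRSACConfig.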
1 change: 1 addition & 0 deletions nnabla_rl/algorithms/README.md
@@ -41,6 +41,7 @@ nnabla-rl offers various (deep) reinforcement learning and optimal control algor
 |[SAC](https://arxiv.org/abs/1812.05905)|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:x:|:x:|:heavy_check_mark:|
 |[SAC (ICML 2018 version)](https://arxiv.org/abs/1801.01290)|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:x:|:x:|:heavy_check_mark:|
 |[SAC-D](https://arxiv.org/abs/2206.13901)|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:x:|:x:|:heavy_check_mark:|
+|[SRSAC](https://openreview.net/pdf?id=OpC-9aBBVJe)|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:x:|:x:|:heavy_check_mark:|
 |[TD3](https://arxiv.org/abs/1802.09477)|:heavy_check_mark:|:heavy_check_mark:|:heavy_check_mark:|:x:|:x:|:heavy_check_mark:|
 |[TRPO](https://arxiv.org/abs/1502.05477)|:heavy_check_mark:|:x:|:heavy_check_mark:|(We will support discrete action in the future)|:x:|:x:|
 |[TRPO (ICML 2015 version)](https://arxiv.org/abs/1502.05477)|:heavy_check_mark:|:x:|:heavy_check_mark:|:heavy_check_mark:|:x:|:x:|
5 changes: 4 additions & 1 deletion nnabla_rl/algorithms/__init__.py
@@ -1,5 +1,5 @@
 # Copyright 2020,2021 Sony Corporation.
-# Copyright 2021,2022,2023 Sony Group Corporation.
+# Copyright 2021,2022,2023,2024 Sony Group Corporation.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -49,6 +49,7 @@
 from nnabla_rl.algorithms.reinforce import REINFORCE, REINFORCEConfig
 from nnabla_rl.algorithms.sac import SAC, SACConfig
 from nnabla_rl.algorithms.sacd import SACD, SACDConfig
+from nnabla_rl.algorithms.srsac import SRSAC, EfficientSRSAC, EfficientSRSACConfig, SRSACConfig
 from nnabla_rl.algorithms.td3 import TD3, TD3Config
 from nnabla_rl.algorithms.trpo import TRPO, TRPOConfig
 from nnabla_rl.algorithms.xql import XQL, XQLConfig
@@ -112,6 +113,8 @@ def get_class_of(name):
 register_algorithm(REINFORCE, REINFORCEConfig)
 register_algorithm(SAC, SACConfig)
 register_algorithm(SACD, SACDConfig)
+register_algorithm(SRSAC, SRSACConfig)
+register_algorithm(EfficientSRSAC, EfficientSRSACConfig)
 register_algorithm(TD3, TD3Config)
 register_algorithm(ICML2015TRPO, ICML2015TRPOConfig)
 register_algorithm(TRPO, TRPOConfig)
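Since this hunk also touches get_class_of's module, here is a small sketch of resolving the new registrations by name. It assumes the registry keys on the class name; the exact return shape of get_class_of is not shown in this diff, so the result is only printed:

```python
# Hedged sketch: look up the algorithms registered above by name.
import nnabla_rl.algorithms as A

print(A.get_class_of("SRSAC"))
print(A.get_class_of("EfficientSRSAC"))
```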
26 changes: 16 additions & 10 deletions nnabla_rl/algorithms/sac.py
@@ -1,5 +1,5 @@
 # Copyright 2020,2021 Sony Corporation.
-# Copyright 2021,2022,2023 Sony Group Corporation.
+# Copyright 2021,2022,2023,2024 Sony Group Corporation.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -244,20 +244,16 @@ def __init__(self, env_or_env_info: Union[gym.Env, EnvironmentInfo],
         self._pi = policy_builder(scope_name="pi", env_info=self._env_info, algorithm_config=self._config)
         self._pi_solver = policy_solver_builder(self._env_info, self._config)
 
-        self._temperature = MT.policy_trainers.soft_policy_trainer.AdjustableTemperature(
-            scope_name='temperature',
-            initial_value=self._config.initial_temperature)
+        self._temperature = self._setup_temperature_model()
         if not self._config.fix_temperature:
             self._temperature_solver = temperature_solver_builder(self._env_info, self._config)
         else:
             self._temperature_solver = None
 
         self._replay_buffer = replay_buffer_builder(self._env_info, self._config)
 
-        self._evaluation_actor = _StochasticPolicyActionSelector(
-            self._env_info, self._pi.shallowcopy(), deterministic=True)
-        self._exploration_actor = _StochasticPolicyActionSelector(
-            self._env_info, self._pi.shallowcopy(), deterministic=False)
+        self._evaluation_actor = self._setup_evaluation_actor()
+        self._exploration_actor = self._setup_exploration_actor()
 
     @eval_api
     def compute_eval_action(self, state, *, begin_of_episode=False, extra_info={}):
@@ -270,12 +266,22 @@ def _before_training_start(self, env_or_buffer):
         context.set_nnabla_context(self._config.gpu_id)
         self._environment_explorer = self._setup_environment_explorer(env_or_buffer)
         self._policy_trainer = self._setup_policy_training(env_or_buffer)
-        self._q_function_trainer = self._setup_q_function_training(
-            env_or_buffer)
+        self._q_function_trainer = self._setup_q_function_training(env_or_buffer)
 
+    def _setup_evaluation_actor(self):
+        return _StochasticPolicyActionSelector(self._env_info, self._pi.shallowcopy(), deterministic=True)
+
+    def _setup_exploration_actor(self):
+        return _StochasticPolicyActionSelector(self._env_info, self._pi.shallowcopy(), deterministic=False)
+
     def _setup_environment_explorer(self, env_or_buffer):
         return None if self._is_buffer(env_or_buffer) else self._explorer_builder(self._env_info, self._config, self)
 
+    def _setup_temperature_model(self):
+        return MT.policy_trainers.soft_policy_trainer.AdjustableTemperature(
+            scope_name='temperature',
+            initial_value=self._config.initial_temperature)
+
     def _setup_policy_training(self, env_or_buffer):
         policy_trainer_config = MT.policy_trainers.SoftPolicyTrainerConfig(
             fixed_temperature=self._config.fix_temperature,
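The refactoring above turns inline construction in __init__ into overridable _setup_* factory methods. The payoff, sketched here with a hypothetical subclass (SRSAC itself lives in nnabla_rl/algorithms/srsac.py and is not shown in this hunk), is that a derived algorithm can swap a single component without rewriting __init__:

```python
# Hypothetical sketch (not from this PR): a SAC subclass overriding one of the
# new _setup_* hooks. The initial_value below is illustrative only.
import nnabla_rl.model_trainers as MT
from nnabla_rl.algorithms.sac import SAC


class WarmStartTemperatureSAC(SAC):
    def _setup_temperature_model(self):
        # Same AdjustableTemperature as the base class, different initialization.
        return MT.policy_trainers.soft_policy_trainer.AdjustableTemperature(
            scope_name='temperature', initial_value=0.5)
```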