diff --git a/ding/model/template/qvac.py b/ding/model/template/qvac.py new file mode 100644 index 0000000000..e9ebea4aac --- /dev/null +++ b/ding/model/template/qvac.py @@ -0,0 +1,361 @@ +from typing import Union, Dict, Optional +from easydict import EasyDict +import numpy as np +import torch +import torch.nn as nn + +from ding.utils import SequenceType, squeeze, MODEL_REGISTRY +from ..common import RegressionHead, ReparameterizationHead, DiscreteHead, MultiHead, \ + FCEncoder, ConvEncoder + + +@MODEL_REGISTRY.register('continuous_qvac') +class ContinuousQVAC(nn.Module): + """ + Overview: + The neural network and computation graph of algorithms related to Actor-Critic that have both Q-value and V-value critic, such as \ + IQL. This model now supports continuous and hybrid action space. The ContinuousQVAC is composed of \ + four parts: ``actor_encoder``, ``critic_encoder``, ``actor_head`` and ``critic_head``. Encoders are used to \ + extract the feature from various observation. Heads are used to predict corresponding Q-value and V-value or action logit. \ + In high-dimensional observation space like 2D image, we often use a shared encoder for both ``actor_encoder`` \ + and ``critic_encoder``. In low-dimensional observation space like 1D vector, we often use different encoders. + Interfaces: + ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` + """ + mode = ['compute_actor', 'compute_critic'] + + def __init__( + self, + obs_shape: Union[int, SequenceType], + action_shape: Union[int, SequenceType, EasyDict], + action_space: str, + twin_critic: bool = False, + actor_head_hidden_size: int = 64, + actor_head_layer_num: int = 1, + critic_head_hidden_size: int = 64, + critic_head_layer_num: int = 1, + activation: Optional[nn.Module] = nn.SiLU(), #nn.ReLU(), + norm_type: Optional[str] = None, + encoder_hidden_size_list: Optional[SequenceType] = None, + share_encoder: Optional[bool] = False, + ) -> None: + """ + Overview: + Initailize the ContinuousQVAC Model according to input arguments. + Arguments: + - obs_shape (:obj:`Union[int, SequenceType]`): Observation's shape, such as 128, (156, ). + - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's shape, such as 4, (3, ), \ + EasyDict({'action_type_shape': 3, 'action_args_shape': 4}). + - action_space (:obj:`str`): The type of action space, including [``regression``, ``reparameterization``, \ + ``hybrid``], ``regression`` is used for DDPG/TD3, ``reparameterization`` is used for SAC and \ + ``hybrid`` for PADDPG. + - twin_critic (:obj:`bool`): Whether to use twin critic, one of tricks in TD3. + - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor head. + - actor_head_layer_num (:obj:`int`): The num of layers used in the actor network to compute action. + - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic head. + - critic_head_layer_num (:obj:`int`): The num of layers used in the critic network to compute Q-value. + - activation (:obj:`Optional[nn.Module]`): The type of activation function to use in ``MLP`` \ + after each FC layer, if ``None`` then default set to ``nn.ReLU()``. + - norm_type (:obj:`Optional[str]`): The type of normalization to after network layer (FC, Conv), \ + see ``ding.torch_utils.network`` for more details. + - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ + the last element must match ``head_hidden_size``, this argument is only used in image observation. + - share_encoder (:obj:`Optional[bool]`): Whether to share encoder between actor and critic. + """ + super(ContinuousQVAC, self).__init__() + obs_shape: int = squeeze(obs_shape) + action_shape = squeeze(action_shape) + self.action_shape = action_shape + self.action_space = action_space + assert self.action_space in ['regression', 'reparameterization', 'hybrid'], self.action_space + + # encoder + self.share_encoder = share_encoder + if np.isscalar(obs_shape) or len(obs_shape) == 1: + assert not self.share_encoder, "Vector observation doesn't need share encoder." + assert encoder_hidden_size_list is None, "Vector obs encoder only uses one layer nn.Linear" + # Because there is already a layer nn.Linear in the head, so we use nn.Identity here to keep + # compatible with the image observation and avoid adding an extra layer nn.Linear. + self.actor_encoder = nn.Identity() + self.critic_encoder = nn.Identity() + encoder_output_size = obs_shape + elif len(obs_shape) == 3: + + def setup_conv_encoder(): + kernel_size = [3 for _ in range(len(encoder_hidden_size_list))] + stride = [2] + [1 for _ in range(len(encoder_hidden_size_list) - 1)] + return ConvEncoder( + obs_shape, + encoder_hidden_size_list, + activation=activation, + norm_type=norm_type, + kernel_size=kernel_size, + stride=stride + ) + + if self.share_encoder: + encoder = setup_conv_encoder() + self.actor_encoder = self.critic_encoder = encoder + else: + self.actor_encoder = setup_conv_encoder() + self.critic_encoder = setup_conv_encoder() + encoder_output_size = self.actor_encoder.output_size + else: + raise RuntimeError("not support observation shape: {}".format(obs_shape)) + # head + if self.action_space == 'regression': # DDPG, TD3 + self.actor_head = nn.Sequential( + nn.Linear(encoder_output_size, actor_head_hidden_size), activation, + RegressionHead( + actor_head_hidden_size, + action_shape, + actor_head_layer_num, + final_tanh=True, + activation=activation, + norm_type=norm_type + ) + ) + elif self.action_space == 'reparameterization': # SAC + self.actor_head = nn.Sequential( + nn.Linear(encoder_output_size, actor_head_hidden_size), activation, + ReparameterizationHead( + actor_head_hidden_size, + action_shape, + actor_head_layer_num, + sigma_type='conditioned', + activation=activation, + norm_type=norm_type + ) + ) + elif self.action_space == 'hybrid': # PADDPG + # hybrid action space: action_type(discrete) + action_args(continuous), + # such as {'action_type_shape': torch.LongTensor([0]), 'action_args_shape': torch.FloatTensor([0.1, -0.27])} + action_shape.action_args_shape = squeeze(action_shape.action_args_shape) + action_shape.action_type_shape = squeeze(action_shape.action_type_shape) + actor_action_args = nn.Sequential( + nn.Linear(encoder_output_size, actor_head_hidden_size), activation, + RegressionHead( + actor_head_hidden_size, + action_shape.action_args_shape, + actor_head_layer_num, + final_tanh=True, + activation=activation, + norm_type=norm_type + ) + ) + actor_action_type = nn.Sequential( + nn.Linear(encoder_output_size, actor_head_hidden_size), activation, + DiscreteHead( + actor_head_hidden_size, + action_shape.action_type_shape, + actor_head_layer_num, + activation=activation, + norm_type=norm_type, + ) + ) + self.actor_head = nn.ModuleList([actor_action_type, actor_action_args]) + + self.twin_critic = twin_critic + if self.action_space == 'hybrid': + critic_q_input_size = encoder_output_size + action_shape.action_type_shape + action_shape.action_args_shape + critic_v_input_size = encoder_output_size + else: + critic_q_input_size = encoder_output_size + action_shape + critic_v_input_size = encoder_output_size + if self.twin_critic: + self.critic_q_head = nn.ModuleList() + self.critic_v_head = nn.ModuleList() + for _ in range(2): + self.critic_q_head.append( + nn.Sequential( + nn.Linear(critic_q_input_size, critic_head_hidden_size), activation, + RegressionHead( + critic_head_hidden_size, + 1, + critic_head_layer_num, + final_tanh=False, + activation=activation, + norm_type=norm_type + ) + ) + ) + self.critic_v_head = nn.Sequential( + nn.Linear(critic_v_input_size, critic_head_hidden_size), activation, + RegressionHead( + critic_head_hidden_size, + 1, + critic_head_layer_num, + final_tanh=False, + activation=activation, + norm_type=norm_type + ) + ) + else: + self.critic_q_head = nn.Sequential( + nn.Linear(critic_q_input_size, critic_head_hidden_size), activation, + RegressionHead( + critic_head_hidden_size, + 1, + critic_head_layer_num, + final_tanh=False, + activation=activation, + norm_type=norm_type + ) + ) + self.critic_v_head = nn.Sequential( + nn.Linear(critic_v_input_size, critic_head_hidden_size), activation, + RegressionHead( + critic_head_hidden_size, + 1, + critic_head_layer_num, + final_tanh=False, + activation=activation, + norm_type=norm_type + ) + ) + + # Convenient for calling some apis (e.g. self.critic.parameters()), + # but may cause misunderstanding when `print(self)` + self.actor = nn.ModuleList([self.actor_encoder, self.actor_head]) + self.critic = nn.ModuleList([self.critic_encoder, self.critic_q_head, self.critic_v_head]) + + def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: str) -> Dict[str, torch.Tensor]: + """ + Overview: + QVAC forward computation graph, input observation tensor to predict Q-value or action logit. Different \ + ``mode`` will forward with different network modules to get different outputs and save computation. + Arguments: + - inputs (:obj:`Union[torch.Tensor, Dict[str, torch.Tensor]]`): The input data for forward computation \ + graph, for ``compute_actor``, it is the observation tensor, for ``compute_critic``, it is the \ + dict data including obs and action tensor. + - mode (:obj:`str`): The forward mode, all the modes are defined in the beginning of this class. + Returns: + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of QVAC forward computation graph, whose \ + key-values vary in different forward modes. + Examples (Actor): + >>> # Regression mode + >>> model = ContinuousQVAC(64, 6, 'regression') + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_actor') + >>> assert actor_outputs['action'].shape == torch.Size([4, 6]) + >>> # Reparameterization Mode + >>> model = ContinuousQVAC(64, 6, 'reparameterization') + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_actor') + >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 6]) # mu + >>> actor_outputs['logit'][1].shape == torch.Size([4, 6]) # sigma + + Examples (Critic): + >>> inputs = {'obs': torch.randn(4, 8), 'action': torch.randn(4, 1)} + >>> model = ContinuousQVAC(obs_shape=(8, ),action_shape=1, action_space='regression') + >>> assert model(inputs, mode='compute_critic')['q_value'].shape == (4, ) # q value + """ + assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) + return getattr(self, mode)(inputs) + + def compute_actor(self, obs: torch.Tensor) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]: + """ + Overview: + QVAC forward computation graph for actor part, input observation tensor to predict action or action logit. + Arguments: + - x (:obj:`torch.Tensor`): The input observation tensor data. + Returns: + - outputs (:obj:`Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]`): Actor output dict varying \ + from action_space: ``regression``, ``reparameterization``, ``hybrid``. + ReturnsKeys (regression): + - action (:obj:`torch.Tensor`): Continuous action with same size as ``action_shape``, usually in DDPG/TD3. + ReturnsKeys (reparameterization): + - logit (:obj:`Dict[str, torch.Tensor]`): The predictd reparameterization action logit, usually in SAC. \ + It is a list containing two tensors: ``mu`` and ``sigma``. The former is the mean of the gaussian \ + distribution, the latter is the standard deviation of the gaussian distribution. + ReturnsKeys (hybrid): + - logit (:obj:`torch.Tensor`): The predicted discrete action type logit, it will be the same dimension \ + as ``action_type_shape``, i.e., all the possible discrete action types. + - action_args (:obj:`torch.Tensor`): Continuous action arguments with same size as ``action_args_shape``. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``obs_shape``. + - action (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size and N1 corresponds to ``action_shape``. + - logit.mu (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size and N1 corresponds to ``action_shape``. + - logit.sigma (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size. + - logit (:obj:`torch.Tensor`): :math:`(B, N2)`, B is batch size and N2 corresponds to \ + ``action_shape.action_type_shape``. + - action_args (:obj:`torch.Tensor`): :math:`(B, N3)`, B is batch size and N3 corresponds to \ + ``action_shape.action_args_shape``. + Examples: + >>> # Regression mode + >>> model = ContinuousQVAC(64, 6, 'regression') + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_actor') + >>> assert actor_outputs['action'].shape == torch.Size([4, 6]) + >>> # Reparameterization Mode + >>> model = ContinuousQVAC(64, 6, 'reparameterization') + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_actor') + >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 6]) # mu + >>> actor_outputs['logit'][1].shape == torch.Size([4, 6]) # sigma + """ + obs = self.actor_encoder(obs) + if self.action_space == 'regression': + x = self.actor_head(obs) + return {'action': x['pred']} + elif self.action_space == 'reparameterization': + x = self.actor_head(obs) + return {'logit': [x['mu'], x['sigma']]} + elif self.action_space == 'hybrid': + logit = self.actor_head[0](obs) + action_args = self.actor_head[1](obs) + return {'logit': logit['logit'], 'action_args': action_args['pred']} + + def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """ + Overview: + QVAC forward computation graph for critic part, input observation and action tensor to predict Q-value. + Arguments: + - inputs (:obj:`Dict[str, torch.Tensor]`): The dict of input data, including ``obs`` and ``action`` \ + tensor, also contains ``logit`` and ``action_args`` tensor in hybrid action_space. + ArgumentsKeys: + - obs: (:obj:`torch.Tensor`): Observation tensor data, now supports a batch of 1-dim vector data. + - action (:obj:`Union[torch.Tensor, Dict]`): Continuous action with same size as ``action_shape``. + - logit (:obj:`torch.Tensor`): Discrete action logit, only in hybrid action_space. + - action_args (:obj:`torch.Tensor`): Continuous action arguments, only in hybrid action_space. + Returns: + - outputs (:obj:`Dict[str, torch.Tensor]`): The output dict of QVAC's forward computation graph for critic, \ + including ``q_value``. + ReturnKeys: + - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape``. + - logit (:obj:`torch.Tensor`): :math:`(B, N2)`, B is batch size and N2 corresponds to \ + ``action_shape.action_type_shape``. + - action_args (:obj:`torch.Tensor`): :math:`(B, N3)`, B is batch size and N3 corresponds to \ + ``action_shape.action_args_shape``. + - action (:obj:`torch.Tensor`): :math:`(B, N4)`, where B is batch size and N4 is ``action_shape``. + - q_value (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch size. + + Examples: + >>> inputs = {'obs': torch.randn(4, 8), 'action': torch.randn(4, 1)} + >>> model = ContinuousQVAC(obs_shape=(8, ),action_shape=1, action_space='regression') + >>> assert model(inputs, mode='compute_critic')['q_value'].shape == (4, ) # q value + """ + + obs, action = inputs['obs'], inputs['action'] + obs = self.critic_encoder(obs) + assert len(obs.shape) == 2 + if self.action_space == 'hybrid': + action_type_logit = inputs['logit'] + action_type_logit = torch.softmax(action_type_logit, dim=-1) + action_args = action['action_args'] + if len(action_args.shape) == 1: + action_args = action_args.unsqueeze(1) + x = torch.cat([obs, action_type_logit, action_args], dim=1) + else: + if len(action.shape) == 1: # (B, ) -> (B, 1) + action = action.unsqueeze(1) + x = torch.cat([obs, action], dim=1) + if self.twin_critic: + x = [m(x)['pred'] for m in self.critic_q_head] + y = self.critic_v_head(obs)['pred'] + else: + x = self.critic_q_head(x)['pred'] + y = self.critic_v_head(obs)['pred'] + return {'q_value': x, 'v_value': y} diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py index 1289d8e6ca..acf224185b 100644 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -43,6 +43,7 @@ from .d4pg import D4PGPolicy from .cql import CQLPolicy, DiscreteCQLPolicy +from .iql import IQLPolicy from .dt import DTPolicy from .pdqn import PDQNPolicy from .madqn import MADQNPolicy @@ -322,6 +323,11 @@ class CQLCommandModePolicy(CQLPolicy, DummyCommandModePolicy): pass +@POLICY_REGISTRY.register('iql_command') +class IQLCommandModePolicy(IQLPolicy, DummyCommandModePolicy): + pass + + @POLICY_REGISTRY.register('discrete_cql_command') class DiscreteCQLCommandModePolicy(DiscreteCQLPolicy, EpsCommandModePolicy): pass diff --git a/ding/policy/iql.py b/ding/policy/iql.py new file mode 100644 index 0000000000..68a2620dd3 --- /dev/null +++ b/ding/policy/iql.py @@ -0,0 +1,646 @@ +from typing import List, Dict, Any, Tuple, Union +import copy +from collections import namedtuple +import numpy as np +import torch +import torch.nn.functional as F +from torch.distributions import Normal, Independent, TransformedDistribution +from torch.distributions.transforms import TanhTransform, AffineTransform + +from ding.torch_utils import Adam, to_device +from ding.rl_utils import v_1step_td_data, v_1step_td_error, get_train_sample, \ + qrdqn_nstep_td_data, qrdqn_nstep_td_error, get_nstep_return_data +from ding.model import model_wrap +from ding.utils import POLICY_REGISTRY +from ding.utils.data import default_collate, default_decollate +from .base_policy import Policy +from .common_utils import default_preprocess_learn + + +def asymmetric_l2_loss(u, tau): + return torch.mean(torch.abs(tau - (u < 0).float()) * u ** 2) + + +@POLICY_REGISTRY.register('iql') +class IQLPolicy(Policy): + """ + Overview: + Policy class of Implicit Q-Learning (IQL) algorithm for continuous control. Paper link: https://arxiv.org/abs/2110.06169. + + Config: + == ==================== ======== ============= ================================= ======================= + ID Symbol Type Default Value Description Other(Shape) + == ==================== ======== ============= ================================= ======================= + 1 ``type`` str iql | RL policy register name, refer | this arg is optional, + | to registry ``POLICY_REGISTRY`` | a placeholder + 2 ``cuda`` bool True | Whether to use cuda for network | + 3 | ``random_`` int 10000 | Number of randomly collected | Default to 10000 for + | ``collect_size`` | training samples in replay | SAC, 25000 for DDPG/ + | | buffer when training starts. | TD3. + 4 | ``model.policy_`` int 256 | Linear layer size for policy | + | ``embedding_size`` | network. | + 5 | ``model.soft_q_`` int 256 | Linear layer size for soft q | + | ``embedding_size`` | network. | + 6 | ``model.value_`` int 256 | Linear layer size for value | Defalut to None when + | ``embedding_size`` | network. | model.value_network + | | | is False. + 7 | ``learn.learning`` float 3e-4 | Learning rate for soft q | Defalut to 1e-3, when + | ``_rate_q`` | network. | model.value_network + | | | is True. + 8 | ``learn.learning`` float 3e-4 | Learning rate for policy | Defalut to 1e-3, when + | ``_rate_policy`` | network. | model.value_network + | | | is True. + 9 | ``learn.learning`` float 3e-4 | Learning rate for policy | Defalut to None when + | ``_rate_value`` | network. | model.value_network + | | | is False. + 10 | ``learn.alpha`` float 0.2 | Entropy regularization | alpha is initiali- + | | coefficient. | zation for auto + | | | `alpha`, when + | | | auto_alpha is True + 11 | ``learn.repara_`` bool True | Determine whether to use | + | ``meterization`` | reparameterization trick. | + 12 | ``learn.`` bool False | Determine whether to use | Temperature parameter + | ``auto_alpha`` | auto temperature parameter | determines the + | | `alpha`. | relative importance + | | | of the entropy term + | | | against the reward. + 13 | ``learn.-`` bool False | Determine whether to ignore | Use ignore_done only + | ``ignore_done`` | done flag. | in halfcheetah env. + 14 | ``learn.-`` float 0.005 | Used for soft update of the | aka. Interpolation + | ``target_theta`` | target network. | factor in polyak aver + | | | aging for target + | | | networks. + == ==================== ======== ============= ================================= ======================= + """ + + config = dict( + # (str) RL policy register name (refer to function "POLICY_REGISTRY"). + type='iql', + # (bool) Whether to use cuda for policy. + cuda=False, + # (bool) on_policy: Determine whether on-policy or off-policy. + # on-policy setting influences the behaviour of buffer. + on_policy=False, + # (bool) priority: Determine whether to use priority in buffer sample. + priority=False, + # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + priority_IS_weight=False, + # (int) Number of training samples(randomly collected) in replay buffer when training starts. + random_collect_size=10000, + model=dict( + # (bool type) twin_critic: Determine whether to use double-soft-q-net for target q computation. + # Please refer to TD3 about Clipped Double-Q Learning trick, which learns two Q-functions instead of one . + # Default to True. + twin_critic=True, + # (str type) action_space: Use reparameterization trick for continous action + action_space='reparameterization', + # (int) Hidden size for actor network head. + actor_head_hidden_size=512, + actor_head_layer_num=3, + # (int) Hidden size for critic network head. + critic_head_hidden_size=512, + critic_head_layer_num=2, + ), + # learn_mode config + learn=dict( + # (int) How many updates (iterations) to train after collector's one collection. + # Bigger "update_per_collect" means bigger off-policy. + update_per_collect=1, + # (int) Minibatch size for gradient descent. + batch_size=256, + # (float) learning_rate_q: Learning rate for soft q network. + learning_rate_q=3e-4, + # (float) learning_rate_policy: Learning rate for policy network. + learning_rate_policy=3e-4, + # (float) learning_rate_alpha: Learning rate for auto temperature parameter ``alpha``. + learning_rate_alpha=3e-4, + # (float) target_theta: Used for soft update of the target network, + # aka. Interpolation factor in polyak averaging for target networks. + target_theta=0.005, + # (float) discount factor for the discounted sum of rewards, aka. gamma. + discount_factor=0.99, + # (float) alpha: Entropy regularization coefficient. + # Please check out the original SAC paper (arXiv 1801.01290): Eq 1 for more details. + # If auto_alpha is set to `True`, alpha is initialization for auto `\alpha`. + # Default to 0.2. + alpha=0.2, + # (bool) auto_alpha: Determine whether to use auto temperature parameter `\alpha` . + # Temperature parameter determines the relative importance of the entropy term against the reward. + # Please check out the original SAC paper (arXiv 1801.01290): Eq 1 for more details. + # Default to False. + # Note that: Using auto alpha needs to set learning_rate_alpha in `cfg.policy.learn`. + auto_alpha=True, + # (bool) log_space: Determine whether to use auto `\alpha` in log space. + log_space=True, + # (bool) Whether ignore done(usually for max step termination env. e.g. pendulum) + # Note: Gym wraps the MuJoCo envs by default with TimeLimit environment wrappers. + # These limit HalfCheetah, and several other MuJoCo envs, to max length of 1000. + # However, interaction with HalfCheetah always gets done with done is False, + # Since we inplace done==True with done==False to keep + # TD-error accurate computation(``gamma * (1 - done) * next_v + reward``), + # when the episode step is greater than max episode step. + ignore_done=False, + # (float) Weight uniform initialization range in the last output layer. + init_w=3e-3, + # (int) The numbers of action sample each at every state s from a uniform-at-random. + num_actions=10, + # (bool) Whether use lagrange multiplier in q value loss. + with_lagrange=False, + # (float) The threshold for difference in Q-values. + lagrange_thresh=-1, + # (float) Loss weight for conservative item. + min_q_weight=1.0, + # (float) coefficient for the asymmetric loss, range from [0.5, 1.0], default to 0.70. + tau=0.7, + # (float) temperature coefficient for Advantage Weighted Regression loss, default to 1.0. + beta=1.0, + ), + eval=dict(), # for compatibility + ) + + def default_model(self) -> Tuple[str, List[str]]: + """ + Overview: + Return this algorithm default neural network model setting for demonstration. ``__init__`` method will \ + automatically call this method to get the default model setting and create model. + Returns: + - model_info (:obj:`Tuple[str, List[str]]`): The registered model name and model's import_names. + """ + + return 'continuous_qvac', ['ding.model.template.qvac'] + + def _init_learn(self) -> None: + """ + Overview: + Initialize the learn mode of policy, including related attributes and modules. For SAC, it mainly \ + contains three optimizers, algorithm-specific arguments such as gamma, min_q_weight, with_lagrange, \ + main and target model. Especially, the ``auto_alpha`` mechanism for balancing max entropy \ + target is also initialized here. + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. + """ + self._priority = self._cfg.priority + self._priority_IS_weight = self._cfg.priority_IS_weight + self._twin_critic = self._cfg.model.twin_critic + self._num_actions = self._cfg.learn.num_actions + + self._min_q_version = 3 + self._min_q_weight = self._cfg.learn.min_q_weight + self._with_lagrange = self._cfg.learn.with_lagrange and (self._lagrange_thresh > 0) + self._lagrange_thresh = self._cfg.learn.lagrange_thresh + if self._with_lagrange: + self.target_action_gap = self._lagrange_thresh + self.log_alpha_prime = torch.tensor(0.).to(self._device).requires_grad_() + self.alpha_prime_optimizer = Adam( + [self.log_alpha_prime], + lr=self._cfg.learn.learning_rate_q, + ) + + # Weight Init + init_w = self._cfg.learn.init_w + self._model.actor_head[-1].mu.weight.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].mu.bias.data.uniform_(-init_w, init_w) + # self._model.actor_head[-1].log_sigma_layer.weight.data.uniform_(-init_w, init_w) + # self._model.actor_head[-1].log_sigma_layer.bias.data.uniform_(-init_w, init_w) + if self._twin_critic: + self._model.critic_q_head[0][-1].last.weight.data.uniform_(-init_w, init_w) + self._model.critic_q_head[0][-1].last.bias.data.uniform_(-init_w, init_w) + self._model.critic_q_head[1][-1].last.weight.data.uniform_(-init_w, init_w) + self._model.critic_q_head[1][-1].last.bias.data.uniform_(-init_w, init_w) + else: + self._model.critic_q_head[2].last.weight.data.uniform_(-init_w, init_w) + self._model.critic_q_head[-1].last.bias.data.uniform_(-init_w, init_w) + self._model.critic_v_head[2].last.weight.data.uniform_(-init_w, init_w) + self._model.critic_v_head[-1].last.bias.data.uniform_(-init_w, init_w) + + # Optimizers + self._optimizer_q = Adam( + self._model.critic.parameters(), + lr=self._cfg.learn.learning_rate_q, + ) + self._optimizer_policy = Adam( + self._model.actor.parameters(), + lr=self._cfg.learn.learning_rate_policy, + ) + + # Algorithm config + self._gamma = self._cfg.learn.discount_factor + + self._learn_model = model_wrap(self._model, wrapper_name='base') + self._learn_model.reset() + + self._forward_learn_cnt = 0 + + self._tau = self._cfg.learn.tau + self._beta = self._cfg.learn.beta + self._policy_start_training_counter = 10000 #300000 + + def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + Overview: + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the offline dataset and then returns the output \ + result, including various training information such as loss, action, priority. + Arguments: + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \ + training samples. For each element in list, the key of the dict is the name of data items and the \ + value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or there dict/list \ + combinations. In the ``_forward_learn`` method, data often need to first be stacked in the batch \ + dimension by some utility functions such as ``default_preprocess_learn``. \ + For IQL, each element in list is a dict containing at least the following keys: ``obs``, ``action``, \ + ``reward``, ``next_obs``, ``done``. Sometimes, it also contains other keys such as ``weight``. + Returns: + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicated training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. + """ + loss_dict = {} + data = default_preprocess_learn( + data, + use_priority=self._priority, + use_priority_IS_weight=self._cfg.priority_IS_weight, + ignore_done=self._cfg.learn.ignore_done, + use_nstep=False + ) + if len(data.get('action').shape) == 1: + data['action'] = data['action'].reshape(-1, 1) + + if self._cuda: + data = to_device(data, self._device) + + self._learn_model.train() + obs = data['obs'] + next_obs = data['next_obs'] + reward = data['reward'] + done = data['done'] + + # 1. predict q and v value + value = self._learn_model.forward(data, mode='compute_critic') + q_value, v_value = value['q_value'], value['v_value'] + + # 2. predict target value + with torch.no_grad(): + (mu, sigma) = self._learn_model.forward(next_obs, mode='compute_actor')['logit'] + + next_obs_dist = TransformedDistribution( + Independent(Normal(mu, sigma), 1), + transforms=[TanhTransform(cache_size=1), + AffineTransform(loc=0.0, scale=1.05)] + ) + next_action = next_obs_dist.rsample() + next_log_prob = next_obs_dist.log_prob(next_action) + + next_data = {'obs': next_obs, 'action': next_action} + next_value = self._learn_model.forward(next_data, mode='compute_critic') + next_q_value, next_v_value = next_value['q_value'], next_value['v_value'] + + # the value of a policy according to the maximum entropy objective + if self._twin_critic: + next_q_value = torch.min(next_q_value[0], next_q_value[1]) + + # 3. compute v loss + if self._twin_critic: + q_value_min = torch.min(q_value[0], q_value[1]).detach() + v_loss = asymmetric_l2_loss(q_value_min - v_value, self._tau) + else: + advantage = q_value.detach() - v_value + v_loss = asymmetric_l2_loss(advantage, self._tau) + + # 4. compute q loss + if self._twin_critic: + q_data0 = v_1step_td_data(q_value[0], next_v_value, reward, done, data['weight']) + loss_dict['critic_q_loss'], td_error_per_sample0 = v_1step_td_error(q_data0, self._gamma) + q_data1 = v_1step_td_data(q_value[1], next_v_value, reward, done, data['weight']) + loss_dict['twin_critic_q_loss'], td_error_per_sample1 = v_1step_td_error(q_data1, self._gamma) + q_loss = (loss_dict['critic_q_loss'] + loss_dict['twin_critic_q_loss']) / 2 + else: + q_data = v_1step_td_data(q_value, next_v_value, reward, done, data['weight']) + loss_dict['critic_q_loss'], td_error_per_sample = v_1step_td_error(q_data, self._gamma) + q_loss = loss_dict['critic_q_loss'] + + # 5. update q and v network + self._optimizer_q.zero_grad() + v_loss.backward() + q_loss.backward() + self._optimizer_q.step() + + # 6. evaluate to get action distribution + (mu, sigma) = self._learn_model.forward(data['obs'], mode='compute_actor')['logit'] + + dist = TransformedDistribution( + Independent(Normal(mu, sigma), 1), + transforms=[TanhTransform(cache_size=1), AffineTransform(loc=0.0, scale=1.05)] + ) + action = data['action'] + log_prob = dist.log_prob(action) + + eval_data = {'obs': obs, 'action': action} + new_value = self._learn_model.forward(eval_data, mode='compute_critic') + new_q_value, new_v_value = new_value['q_value'], new_value['v_value'] + if self._twin_critic: + new_q_value = torch.min(new_q_value[0], new_q_value[1]) + new_advantage = new_q_value - new_v_value + + # 8. compute policy loss + policy_loss = (-log_prob * torch.exp(new_advantage.detach() / self._beta).clamp(max=20.0)).mean() + self._policy_start_training_counter -= 1 + + loss_dict['policy_loss'] = policy_loss + + # 9. update policy network + self._optimizer_policy.zero_grad() + policy_loss.backward() + policy_grad_norm = torch.nn.utils.clip_grad_norm_(self._model.actor.parameters(), 1) + self._optimizer_policy.step() + + loss_dict['total_loss'] = sum(loss_dict.values()) + + # ============= + # after update + # ============= + self._forward_learn_cnt += 1 + + return { + 'cur_lr_q': self._optimizer_q.defaults['lr'], + 'cur_lr_p': self._optimizer_policy.defaults['lr'], + 'priority': q_loss.abs().tolist(), + 'q_loss': q_loss.detach().mean().item(), + 'v_loss': v_loss.detach().mean().item(), + 'log_prob': log_prob.detach().mean().item(), + 'next_q_value': next_q_value.detach().mean().item(), + 'next_v_value': next_v_value.detach().mean().item(), + 'policy_loss': policy_loss.detach().mean().item(), + 'total_loss': loss_dict['total_loss'].detach().item(), + 'advantage_max': new_advantage.max().detach().item(), + 'new_q_value': new_q_value.detach().mean().item(), + 'new_v_value': new_v_value.detach().mean().item(), + 'policy_grad_norm': policy_grad_norm, + } + + def _get_policy_actions(self, data: Dict, num_actions: int = 10, epsilon: float = 1e-6) -> List: + # evaluate to get action distribution + obs = data['obs'] + obs = obs.unsqueeze(1).repeat(1, num_actions, 1).view(obs.shape[0] * num_actions, obs.shape[1]) + (mu, sigma) = self._learn_model.forward(obs, mode='compute_actor')['logit'] + dist = Independent(Normal(mu, sigma), 1) + pred = dist.rsample() + action = torch.tanh(pred) + + # evaluate action log prob depending on Jacobi determinant. + y = 1 - action.pow(2) + epsilon + log_prob = dist.log_prob(pred).unsqueeze(-1) + log_prob = log_prob - torch.log(y).sum(-1, keepdim=True) + + return action, log_prob.view(-1, num_actions, 1) + + def _get_q_value(self, data: Dict, keep: bool = True) -> torch.Tensor: + new_q_value = self._learn_model.forward(data, mode='compute_critic')['q_value'] + if self._twin_critic: + new_q_value = [value.view(-1, self._num_actions, 1) for value in new_q_value] + else: + new_q_value = new_q_value.view(-1, self._num_actions, 1) + if self._twin_critic and not keep: + new_q_value = torch.min(new_q_value[0], new_q_value[1]) + return new_q_value + + def _get_v_value(self, data: Dict, keep: bool = True) -> torch.Tensor: + new_v_value = self._learn_model.forward(data, mode='compute_critic')['v_value'] + if self._twin_critic: + new_v_value = [value.view(-1, self._num_actions, 1) for value in new_v_value] + else: + new_v_value = new_v_value.view(-1, self._num_actions, 1) + if self._twin_critic and not keep: + new_v_value = torch.min(new_v_value[0], new_v_value[1]) + return new_v_value + + def _init_collect(self) -> None: + """ + Overview: + Initialize the collect mode of policy, including related attributes and modules. For SAC, it contains the \ + collect_model other algorithm-specific arguments such as unroll_len. \ + This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ + with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. + """ + self._unroll_len = self._cfg.collect.unroll_len + self._collect_model = model_wrap(self._model, wrapper_name='base') + self._collect_model.reset() + + def _forward_collect(self, data: Dict[int, Any], **kwargs) -> Dict[int, Any]: + """ + Overview: + Policy forward function of collect mode (collecting training data by interacting with envs). Forward means \ + that the policy gets some necessary data (mainly observation) from the envs and then returns the output \ + data, such as the action to interact with the envs. + Arguments: + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is environment id and the value is the corresponding data of the env. + Returns: + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action and \ + other necessary data for learn mode defined in ``self._process_transition`` method. The key of the \ + dict is the same as the input data, i.e. environment id. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. + + .. note:: + ``logit`` in SAC means the mu and sigma of Gaussioan distribution. Here we use this name for consistency. + + .. note:: + For more detailed examples, please refer to our unittest for SACPolicy: ``ding.policy.tests.test_sac``. + """ + data_id = list(data.keys()) + data = default_collate(list(data.values())) + if self._cuda: + data = to_device(data, self._device) + self._collect_model.eval() + with torch.no_grad(): + (mu, sigma) = self._collect_model.forward(data, mode='compute_actor')['logit'] + dist = Independent(Normal(mu, sigma), 1) + action = torch.tanh(dist.rsample()) + output = {'logit': (mu, sigma), 'action': action} + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} + + def _process_transition(self, obs: torch.Tensor, policy_output: Dict[str, torch.Tensor], + timestep: namedtuple) -> Dict[str, torch.Tensor]: + """ + Overview: + Process and pack one timestep transition data into a dict, which can be directly used for training and \ + saved in replay buffer. For continuous SAC, it contains obs, next_obs, action, reward, done. The logit \ + will be also added when ``collector_logit`` is True. + Arguments: + - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. + - policy_output (:obj:`Dict[str, torch.Tensor]`): The output of the policy network with the observation \ + as input. For continuous SAC, it contains the action and the logit (mu and sigma) of the action. + - timestep (:obj:`namedtuple`): The execution result namedtuple returned by the environment step method, \ + except all the elements have been transformed into tensor data. Usually, it contains the next obs, \ + reward, done, info, etc. + Returns: + - transition (:obj:`Dict[str, torch.Tensor]`): The processed transition data of the current timestep. + """ + if self._cfg.collect.collector_logit: + transition = { + 'obs': obs, + 'next_obs': timestep.obs, + 'logit': policy_output['logit'], + 'action': policy_output['action'], + 'reward': timestep.reward, + 'done': timestep.done, + } + else: + transition = { + 'obs': obs, + 'next_obs': timestep.obs, + 'action': policy_output['action'], + 'reward': timestep.reward, + 'done': timestep.done, + } + return transition + + def _get_train_sample(self, transitions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Overview: + For a given trajectory (transitions, a list of transition) data, process it into a list of sample that \ + can be used for training directly. In continuous SAC, a train sample is a processed transition \ + (unroll_len=1). + Arguments: + - transitions (:obj:`List[Dict[str, Any]`): The trajectory data (a list of transition), each element is \ + the same format as the return value of ``self._process_transition`` method. + Returns: + - samples (:obj:`List[Dict[str, Any]]`): The processed train samples, each element is the similar format \ + as input transitions, but may contain more data for training. + """ + return get_train_sample(transitions, self._unroll_len) + + def _init_eval(self) -> None: + """ + Overview: + Initialize the eval mode of policy, including related attributes and modules. For SAC, it contains the \ + eval model, which is equipped with ``base`` model wrapper to ensure compability. + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. + """ + self._eval_model = model_wrap(self._model, wrapper_name='base') + self._eval_model.reset() + + def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: + """ + Overview: + Policy forward function of eval mode (evaluation policy performance by interacting with envs). Forward \ + means that the policy gets some necessary data (mainly observation) from the envs and then returns the \ + action to interact with the envs. + Arguments: + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is environment id and the value is the corresponding data of the env. + Returns: + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action. The \ + key of the dict is the same as the input data, i.e. environment id. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement you own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. + + .. note:: + ``logit`` in SAC means the mu and sigma of Gaussioan distribution. Here we use this name for consistency. + + .. note:: + For more detailed examples, please refer to our unittest for SACPolicy: ``ding.policy.tests.test_sac``. + """ + data_id = list(data.keys()) + data = default_collate(list(data.values())) + if self._cuda: + data = to_device(data, self._device) + self._eval_model.eval() + with torch.no_grad(): + (mu, sigma) = self._eval_model.forward(data, mode='compute_actor')['logit'] + action = torch.tanh(mu) / 1.05 # deterministic_eval + output = {'action': action} + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} + + def _monitor_vars_learn(self) -> List[str]: + """ + Overview: + Return the necessary keys for logging the return dict of ``self._forward_learn``. The logger module, such \ + as text logger, tensorboard logger, will use these keys to save the corresponding data. + Returns: + - necessary_keys (:obj:`List[str]`): The list of the necessary keys to be logged. + """ + twin_critic = ['twin_critic_loss'] if self._twin_critic else [] + return [ + 'cur_lr_q', + 'cur_lr_p', + 'value_loss' + 'policy_loss', + 'q_loss', + 'v_loss', + 'policy_loss', + 'log_prob', + 'total_loss', + 'advantage_max', + 'next_q_value', + 'next_v_value', + 'new_q_value', + 'new_v_value', + 'policy_grad_norm', + ] + twin_critic + + def _state_dict_learn(self) -> Dict[str, Any]: + """ + Overview: + Return the state_dict of learn mode, usually including model and optimizer. + Returns: + - state_dict (:obj:`Dict[str, Any]`): The dict of current policy learn state, for saving and restoring. + """ + return { + 'model': self._learn_model.state_dict(), + 'optimizer_q': self._optimizer_q.state_dict(), + 'optimizer_policy': self._optimizer_policy.state_dict(), + } + + def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None: + """ + Overview: + Load the state_dict variable into policy learn mode. + Arguments: + - state_dict (:obj:`Dict[str, Any]`): The dict of policy learn state saved before. + + .. tip:: + If you want to only load some parts of model, you can simply set the ``strict`` argument in \ + load_state_dict to ``False``, or refer to ``ding.torch_utils.checkpoint_helper`` for more \ + complicated operation. + """ + self._learn_model.load_state_dict(state_dict['model']) + self._optimizer_q.load_state_dict(state_dict['optimizer_q']) + self._optimizer_policy.load_state_dict(state_dict['optimizer_policy']) diff --git a/ding/utils/data/dataset.py b/ding/utils/data/dataset.py index f29dd3335a..bd24e2eaf8 100755 --- a/ding/utils/data/dataset.py +++ b/ding/utils/data/dataset.py @@ -114,6 +114,38 @@ def __init__(self, cfg: dict) -> None: except (KeyError, AttributeError): # do not normalize pass + if hasattr(cfg.env, "reward_norm"): + if cfg.env.reward_norm == "normalize": + dataset['rewards'] = (dataset['rewards'] - dataset['rewards'].mean()) / dataset['rewards'].std() + elif cfg.env.reward_norm == "iql_antmaze": + dataset['rewards'] = dataset['rewards'] - 1.0 + elif cfg.env.reward_norm == "iql_locomotion": + + def return_range(dataset, max_episode_steps): + returns, lengths = [], [] + ep_ret, ep_len = 0.0, 0 + for r, d in zip(dataset["rewards"], dataset["terminals"]): + ep_ret += float(r) + ep_len += 1 + if d or ep_len == max_episode_steps: + returns.append(ep_ret) + lengths.append(ep_len) + ep_ret, ep_len = 0.0, 0 + # returns.append(ep_ret) # incomplete trajectory + lengths.append(ep_len) # but still keep track of number of steps + assert sum(lengths) == len(dataset["rewards"]) + return min(returns), max(returns) + + min_ret, max_ret = return_range(dataset, 1000) + dataset['rewards'] /= max_ret - min_ret + dataset['rewards'] *= 1000 + elif cfg.env.reward_norm == "cql_antmaze": + dataset['rewards'] = (dataset['rewards'] - 0.5) * 4.0 + elif cfg.env.reward_norm == "antmaze": + dataset['rewards'] = (dataset['rewards'] - 0.25) * 2.0 + else: + raise NotImplementedError + self._data = [] self._load_d4rl(dataset) diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_iql_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_iql_config.py new file mode 100644 index 0000000000..144feac1dd --- /dev/null +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_iql_config.py @@ -0,0 +1,54 @@ +# You can conduct Experiments on D4RL with this config file through the following command: +# cd ../entry && python d4rl_iql_main.py +from easydict import EasyDict + +main_config = dict( + exp_name="halfcheetah_medium_expert_iql_seed0", + env=dict( + env_id='halfcheetah-medium-expert-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=6000, + reward_norm="iql_locomotion", + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=4096, + learning_rate_q=3e-4, + learning_rate_policy=1e-4, + beta=0.05, + tau=0.7, + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=5000, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='iql', + import_names=['ding.policy.iql'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config diff --git a/dizoo/d4rl/config/halfcheetah_medium_iql_config.py b/dizoo/d4rl/config/halfcheetah_medium_iql_config.py new file mode 100644 index 0000000000..545ecf970b --- /dev/null +++ b/dizoo/d4rl/config/halfcheetah_medium_iql_config.py @@ -0,0 +1,54 @@ +# You can conduct Experiments on D4RL with this config file through the following command: +# cd ../entry && python d4rl_iql_main.py +from easydict import EasyDict + +main_config = dict( + exp_name="halfcheetah_medium_iql_seed0", + env=dict( + env_id='halfcheetah-medium-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=6000, + reward_norm="iql_locomotion", + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=4096, + learning_rate_q=3e-4, + learning_rate_policy=1e-4, + beta=0.05, + tau=0.7, + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=5000, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='iql', + import_names=['ding.policy.iql'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config diff --git a/dizoo/d4rl/config/halfcheetah_medium_replay_iql_config.py b/dizoo/d4rl/config/halfcheetah_medium_replay_iql_config.py new file mode 100644 index 0000000000..d48a1fb472 --- /dev/null +++ b/dizoo/d4rl/config/halfcheetah_medium_replay_iql_config.py @@ -0,0 +1,54 @@ +# You can conduct Experiments on D4RL with this config file through the following command: +# cd ../entry && python d4rl_iql_main.py +from easydict import EasyDict + +main_config = dict( + exp_name="halfcheetah_medium_replay_iql_seed0", + env=dict( + env_id='halfcheetah-medium-replay-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=6000, + reward_norm="iql_locomotion", + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=4096, + learning_rate_q=3e-4, + learning_rate_policy=1e-4, + beta=0.05, + tau=0.7, + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=5000, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='iql', + import_names=['ding.policy.iql'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config diff --git a/dizoo/d4rl/config/hopper_medium_expert_iql_config.py b/dizoo/d4rl/config/hopper_medium_expert_iql_config.py new file mode 100644 index 0000000000..6aef029c5e --- /dev/null +++ b/dizoo/d4rl/config/hopper_medium_expert_iql_config.py @@ -0,0 +1,54 @@ +# You can conduct Experiments on D4RL with this config file through the following command: +# cd ../entry && python d4rl_iql_main.py +from easydict import EasyDict + +main_config = dict( + exp_name="hopper_medium_expert_iql_seed0", + env=dict( + env_id='hopper-medium-expert-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=6000, + reward_norm="iql_locomotion", + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=11, + action_shape=3, + + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=4096, + learning_rate_q=3e-4, + learning_rate_policy=1e-4, + beta=0.05, + tau=0.7, + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=5000, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='iql', + import_names=['ding.policy.iql'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config diff --git a/dizoo/d4rl/config/hopper_medium_iql_config.py b/dizoo/d4rl/config/hopper_medium_iql_config.py new file mode 100644 index 0000000000..8f429be268 --- /dev/null +++ b/dizoo/d4rl/config/hopper_medium_iql_config.py @@ -0,0 +1,54 @@ +# You can conduct Experiments on D4RL with this config file through the following command: +# cd ../entry && python d4rl_iql_main.py +from easydict import EasyDict + +main_config = dict( + exp_name="hopper_medium_iql_seed0", + env=dict( + env_id='hopper-medium-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=6000, + reward_norm="iql_locomotion", + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=11, + action_shape=3, + + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=4096, + learning_rate_q=3e-4, + learning_rate_policy=1e-4, + beta=0.05, + tau=0.7, + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=5000, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='iql', + import_names=['ding.policy.iql'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config diff --git a/dizoo/d4rl/config/hopper_medium_replay_iql_config.py b/dizoo/d4rl/config/hopper_medium_replay_iql_config.py new file mode 100644 index 0000000000..ad1b222843 --- /dev/null +++ b/dizoo/d4rl/config/hopper_medium_replay_iql_config.py @@ -0,0 +1,54 @@ +# You can conduct Experiments on D4RL with this config file through the following command: +# cd ../entry && python d4rl_iql_main.py +from easydict import EasyDict + +main_config = dict( + exp_name="hopper_medium_replay_iql_seed0", + env=dict( + env_id='hopper-medium-replay-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=6000, + reward_norm="iql_locomotion", + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=11, + action_shape=3, + + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=4096, + learning_rate_q=3e-4, + learning_rate_policy=1e-4, + beta=0.05, + tau=0.7, + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=5000, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='iql', + import_names=['ding.policy.iql'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config diff --git a/dizoo/d4rl/config/walker2d_medium_expert_iql_config.py b/dizoo/d4rl/config/walker2d_medium_expert_iql_config.py new file mode 100644 index 0000000000..a6b4c7d1ac --- /dev/null +++ b/dizoo/d4rl/config/walker2d_medium_expert_iql_config.py @@ -0,0 +1,53 @@ +# You can conduct Experiments on D4RL with this config file through the following command: +# cd ../entry && python d4rl_iql_main.py +from easydict import EasyDict + +main_config = dict( + exp_name="walker2d_medium_expert_iql_seed0", + env=dict( + env_id='walker2d-medium-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=6000, + reward_norm="iql_locomotion", + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=4096, + learning_rate_q=3e-4, + learning_rate_policy=1e-4, + beta=0.05, + tau=0.7, + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=5000, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='iql', + import_names=['ding.policy.iql'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config diff --git a/dizoo/d4rl/config/walker2d_medium_iql_config.py b/dizoo/d4rl/config/walker2d_medium_iql_config.py new file mode 100644 index 0000000000..7e6756dcc6 --- /dev/null +++ b/dizoo/d4rl/config/walker2d_medium_iql_config.py @@ -0,0 +1,53 @@ +# You can conduct Experiments on D4RL with this config file through the following command: +# cd ../entry && python d4rl_iql_main.py +from easydict import EasyDict + +main_config = dict( + exp_name="walker2d_medium_iql_seed0", + env=dict( + env_id='walker2d-medium-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=6000, + reward_norm="iql_locomotion", + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=4096, + learning_rate_q=3e-4, + learning_rate_policy=1e-4, + beta=0.05, + tau=0.7, + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=5000, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='iql', + import_names=['ding.policy.iql'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config diff --git a/dizoo/d4rl/config/walker2d_medium_replay_iql_config.py b/dizoo/d4rl/config/walker2d_medium_replay_iql_config.py new file mode 100644 index 0000000000..5b43225448 --- /dev/null +++ b/dizoo/d4rl/config/walker2d_medium_replay_iql_config.py @@ -0,0 +1,53 @@ +# You can conduct Experiments on D4RL with this config file through the following command: +# cd ../entry && python d4rl_iql_main.py +from easydict import EasyDict + +main_config = dict( + exp_name="walker2d_medium_replay_iql_seed0", + env=dict( + env_id='walker2d-medium-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=6000, + reward_norm="iql_locomotion", + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=4096, + learning_rate_q=3e-4, + learning_rate_policy=1e-4, + beta=0.05, + tau=0.7, + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=5000, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='iql', + import_names=['ding.policy.iql'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config diff --git a/dizoo/d4rl/entry/d4rl_iql_main.py b/dizoo/d4rl/entry/d4rl_iql_main.py new file mode 100644 index 0000000000..ded097ee42 --- /dev/null +++ b/dizoo/d4rl/entry/d4rl_iql_main.py @@ -0,0 +1,21 @@ +from ding.entry import serial_pipeline_offline +from ding.config import read_config +from pathlib import Path + + +def train(args): + # launch from anywhere + config = Path(__file__).absolute().parent.parent / 'config' / args.config + config = read_config(str(config)) + config[0].exp_name = config[0].exp_name.replace('0', str(args.seed)) + serial_pipeline_offline(config, seed=args.seed) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--seed', '-s', type=int, default=10) + parser.add_argument('--config', '-c', type=str, default='halfcheetah_medium_iql_config.py') + args = parser.parse_args() + train(args)