diff --git a/rl_games/algos_torch/a2c_continuous.py b/rl_games/algos_torch/a2c_continuous.py
index e93ea362..72323846 100644
--- a/rl_games/algos_torch/a2c_continuous.py
+++ b/rl_games/algos_torch/a2c_continuous.py
@@ -6,7 +6,7 @@
 from rl_games.common import datasets
 
 from torch import optim
-import torch 
+import torch
 
 
 class A2CAgent(a2c_common.ContinuousA2CBase):
@@ -34,7 +34,7 @@ def __init__(self, base_name, params):
             'normalize_value' : self.normalize_value,
             'normalize_input': self.normalize_input,
         }
-        
+
         self.model = self.network.build(build_config)
         self.model.to(self.ppo_device)
         self.states = None
@@ -74,7 +74,7 @@ def __init__(self, base_name, params):
     def update_epoch(self):
         self.epoch_num += 1
        return self.epoch_num
-        
+
     def save(self, fn):
         state = self.get_full_state_weights()
         torch_ext.save_checkpoint(fn, state)
@@ -114,8 +114,8 @@ def calc_gradients(self, input_dict):
 
         batch_dict = {
             'is_train': True,
-            'prev_actions': actions_batch, 
-            'obs' : obs_batch,
+            'prev_actions': actions_batch,
+            'obs': obs_batch,
         }
 
         rnn_masks = None
@@ -125,9 +125,9 @@ def calc_gradients(self, input_dict):
             batch_dict['seq_length'] = self.seq_length
 
             if self.zero_rnn_on_done:
-                batch_dict['dones'] = input_dict['dones']            
+                batch_dict['dones'] = input_dict['dones']
 
-        with torch.cuda.amp.autocast(enabled=self.mixed_precision):
+        with torch.amp.autocast('cuda', enabled=self.mixed_precision):
             res_dict = self.model(batch_dict)
             action_log_probs = res_dict['prev_neglogp']
             values = res_dict['values']
@@ -138,7 +138,7 @@ def calc_gradients(self, input_dict):
             a_loss = self.actor_loss_func(old_action_log_probs_batch, action_log_probs, advantage, self.ppo, curr_e_clip)
 
             if self.has_value_loss:
-                c_loss = common_losses.critic_loss(self.model,value_preds_batch, values, curr_e_clip, return_batch, self.clip_value)
+                c_loss = common_losses.critic_loss(self.model, value_preds_batch, values, curr_e_clip, return_batch, self.clip_value)
             else:
                 c_loss = torch.zeros(1, device=self.ppo_device)
             if self.bound_loss_type == 'regularisation':
@@ -183,7 +183,7 @@ def calc_gradients(self, input_dict):
                     'new_neglogp' : action_log_probs,
                     'old_neglogp' : old_action_log_probs_batch,
                     'masks' : rnn_masks
-                    }, curr_e_clip, 0)    
+                    }, curr_e_clip, 0)
 
         self.train_result = (a_loss, c_loss, entropy, \
             kl_dist, self.last_lr, lr_mul, \
@@ -209,5 +209,3 @@ def bound_loss(self, mu):
         else:
             b_loss = 0
         return b_loss
-
-
diff --git a/rl_games/algos_torch/network_builder.py b/rl_games/algos_torch/network_builder.py
index 289812dd..d83dee4f 100644
--- a/rl_games/algos_torch/network_builder.py
+++ b/rl_games/algos_torch/network_builder.py
@@ -5,9 +5,9 @@
 import torch.nn as nn
 
 from rl_games.algos_torch.d2rl import D2RLNet
-from rl_games.algos_torch.sac_helper import SquashedNormal 
-from rl_games.common.layers.recurrent import GRUWithDones, LSTMWithDones 
-from rl_games.common.layers.value import TwoHotEncodedValue, DefaultValue 
+from rl_games.algos_torch.sac_helper import SquashedNormal
+from rl_games.common.layers.recurrent import GRUWithDones, LSTMWithDones
+from rl_games.common.layers.value import TwoHotEncodedValue, DefaultValue
 from rl_games.algos_torch.spatial_softmax import SpatialSoftArgmax
 
 
@@ -192,7 +192,6 @@ def _build_value_layer(self, input_size, output_size, value_type='legacy'):
         raise ValueError('value type is not "default", "legacy" or "two_hot_encoded"')
 
 
-
 class A2CBuilder(NetworkBuilder):
     def __init__(self, **kwargs):
         NetworkBuilder.__init__(self)
@@ -339,7 +338,7 @@ def forward(self, obs_dict):
                 a_out = a_out.contiguous().view(a_out.size(0), -1)
 
                 c_out = self.critic_cnn(c_out)
-                c_out = c_out.contiguous().view(c_out.size(0), -1)                    
+                c_out = c_out.contiguous().view(c_out.size(0), -1)
 
             if self.has_rnn:
                 seq_length = obs_dict.get('seq_length', 1)
@@ -359,23 +358,23 @@ def forward(self, obs_dict):
 
                 a_out = a_out.reshape(num_seqs, seq_length, -1)
                 c_out = c_out.reshape(num_seqs, seq_length, -1)
 
-                a_out = a_out.transpose(0,1)
-                c_out = c_out.transpose(0,1)
+                a_out = a_out.transpose(0, 1)
+                c_out = c_out.transpose(0, 1)
                 if dones is not None:
                     dones = dones.reshape(num_seqs, seq_length, -1)
-                    dones = dones.transpose(0,1)
+                    dones = dones.transpose(0, 1)
 
                 if len(states) == 2:
                     a_states = states[0]
                     c_states = states[1]
                 else:
                     a_states = states[:2]
-                    c_states = states[2:]                    
+                    c_states = states[2:]
                 a_out, a_states = self.a_rnn(a_out, a_states, dones, bptt_len)
                 c_out, c_states = self.c_rnn(c_out, c_states, dones, bptt_len)
 
-                a_out = a_out.transpose(0,1)
-                c_out = c_out.transpose(0,1)
+                a_out = a_out.transpose(0, 1)
+                c_out = c_out.transpose(0, 1)
                 a_out = a_out.contiguous().reshape(a_out.size()[0] * a_out.size()[1], -1)
                 c_out = c_out.contiguous().reshape(c_out.size()[0] * c_out.size()[1], -1)
@@ -398,7 +397,7 @@ def forward(self, obs_dict):
             else:
                 a_out = self.actor_mlp(a_out)
                 c_out = self.critic_mlp(c_out)
-                        
+
             value = self.value_act(self.value(c_out))
 
             if self.is_discrete:
@@ -420,7 +419,7 @@ def forward(self, obs_dict):
             else:
                 out = obs
             out = self.actor_cnn(out)
-            out = out.flatten(1)                
+            out = out.flatten(1)
 
             if self.has_rnn:
                 seq_length = obs_dict.get('seq_length', 1)
@@ -474,7 +473,7 @@ def forward(self, obs_dict):
                 else:
                     sigma = self.sigma_act(self.sigma(out))
                 return mu, mu*0 + sigma, value, states
-                    
+
     def is_separate_critic(self):
         return self.separate
 
@@ -555,6 +554,7 @@ def build(self, name, **kwargs):
         net = A2CBuilder.Network(self.params, **kwargs)
         return net
 
+
 class Conv2dAuto(nn.Conv2d):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -623,6 +623,7 @@ def forward(self, x):
         x = self.res_block2(x)
         return x
 
+
 class A2CResnetBuilder(NetworkBuilder):
     def __init__(self, **kwargs):
         NetworkBuilder.__init__(self)
@@ -842,10 +843,10 @@ def is_rnn(self):
     def get_default_rnn_state(self):
         num_layers = self.rnn_layers
         if self.rnn_name == 'lstm':
-            return (torch.zeros((num_layers, self.num_seqs, self.rnn_units)), 
+            return (torch.zeros((num_layers, self.num_seqs, self.rnn_units)),
                     torch.zeros((num_layers, self.num_seqs, self.rnn_units)))
         else:
-            return (torch.zeros((num_layers, self.num_seqs, self.rnn_units)))                
+            return (torch.zeros((num_layers, self.num_seqs, self.rnn_units)))
     def build(self, name, **kwargs):
         net = A2CResnetBuilder.Network(self.params, **kwargs)
         return net
@@ -952,7 +953,7 @@ def __init__(self, params, **kwargs):
                 self.critic = self._build_critic(1, **critic_mlp_args)
                 print("Building Critic Target")
                 self.critic_target = self._build_critic(1, **critic_mlp_args)
-                self.critic_target.load_state_dict(self.critic.state_dict())  
+                self.critic_target.load_state_dict(self.critic.state_dict())
 
         mlp_init = self.init_factory.create(**self.initializer)
         for m in self.modules():
@@ -976,7 +977,7 @@ def forward(self, obs_dict):
             obs = obs_dict['obs']
             mu, sigma = self.actor(obs)
             return mu, sigma
-                
+
     def is_separate_critic(self):
         return self.separate
 
@@ -997,7 +998,7 @@ def load(self, params):
 
         if self.has_space:
             self.is_discrete = 'discrete' in params['space']
-            self.is_continuous = 'continuous'in params['space']
+            self.is_continuous = 'continuous' in params['space']
             if self.is_continuous:
                 self.space_config = params['space']['continuous']
             elif self.is_discrete:
@@ -1005,4 +1006,3 @@ def load(self, params):
         else:
             self.is_discrete = False
             self.is_continuous = False
-
diff --git a/rl_games/common/a2c_common.py b/rl_games/common/a2c_common.py
index 2ae2e544..ddd2982d 100644
--- a/rl_games/common/a2c_common.py
+++ b/rl_games/common/a2c_common.py
@@ -18,10 +18,10 @@
 from datetime import datetime
 from tensorboardX import SummaryWriter
 
-import torch 
+import torch
 from torch import nn
 import torch.distributed as dist
-        
+
 from time import sleep
 
 from rl_games.common import common_losses
@@ -36,6 +36,7 @@ def swap_and_flatten01(arr):
     s = arr.size()
     return arr.transpose(0, 1).reshape(s[0] * s[1], *s[2:])
 
+
 def rescale_actions(low, high, action):
     d = (high - low) / 2.0
     m = (high + low) / 2.0
@@ -132,7 +133,7 @@ def __init__(self, base_name, params):
 
         self.vec_env = config.get('vec_env', None)
         self.ppo_device = config.get('device', 'cuda:0')
-        self.value_size = self.env_info.get('value_size',1)
+        self.value_size = self.env_info.get('value_size', 1)
         self.observation_space = self.env_info['observation_space']
         self.weight_decay = config.get('weight_decay', 0.0)
         self.use_action_masks = config.get('use_action_masks', False)
@@ -144,9 +145,9 @@ def __init__(self, base_name, params):
 
         if self.has_central_value:
             self.state_space = self.env_info.get('state_space', None)
-            if isinstance(self.state_space,gym.spaces.Dict):
+            if isinstance(self.state_space, gym.spaces.Dict):
                 self.state_shape = {}
-                for k,v in self.state_space.spaces.items():
+                for k, v in self.state_space.spaces.items():
                     self.state_shape[k] = v.shape
             else:
                 self.state_shape = self.state_space.shape
@@ -176,7 +177,7 @@ def __init__(self, base_name, params):
 
             self.scheduler = schedulers.AdaptiveScheduler(self.kl_threshold)
         elif self.linear_lr:
-            
+
             if self.max_epochs == -1 and self.max_frames == -1:
                 print("Max epochs and max frames are not set. Linear learning rate schedule can't be used, switching to the contstant (identity) one.")
                 self.scheduler = schedulers.IdentityScheduler()
@@ -190,7 +191,7 @@ def __init__(self, base_name, params):
 
                 self.scheduler = schedulers.LinearScheduler(float(config['learning_rate']),
                     max_steps = max_steps,
-                    use_epochs = use_epochs, 
+                    use_epochs = use_epochs,
                     apply_to_entropy = config.get('schedule_entropy', False),
                     start_entropy_coef = config.get('entropy_coef'))
         else:
@@ -220,11 +221,11 @@ def __init__(self, base_name, params):
 
         if isinstance(self.observation_space, gym.spaces.Dict):
             self.obs_shape = {}
-            for k,v in self.observation_space.spaces.items():
+            for k, v in self.observation_space.spaces.items():
                 self.obs_shape[k] = v.shape
         else:
             self.obs_shape = self.observation_space.shape
-        
+
         self.critic_coef = config['critic_coef']
         self.grad_norm = config['grad_norm']
         self.gamma = self.config['gamma']
@@ -257,7 +258,7 @@ def __init__(self, base_name, params):
         self.mini_epochs_num = self.config['mini_epochs']
 
         self.mixed_precision = self.config.get('mixed_precision', False)
-        self.scaler = torch.cuda.amp.GradScaler(enabled=self.mixed_precision)
+        self.scaler = torch.amp.GradScaler('cuda', enabled=self.mixed_precision)
 
         self.last_lr = self.config['learning_rate']
         self.frame = 0
@@ -369,7 +370,7 @@ def write_stats(self, total_time, epoch_num, step_time, play_time, update_time,
 
         self.writer.add_scalar('losses/c_loss', torch_ext.mean_list(c_losses).item(), frame)
         self.writer.add_scalar('losses/entropy', torch_ext.mean_list(entropies).item(), frame)
-        for k,v in self.aux_loss_dict.items():
+        for k, v in self.aux_loss_dict.items():
             self.writer.add_scalar('losses/' + k, torch_ext.mean_list(v).item(), frame)
         self.writer.add_scalar('info/last_lr', last_lr * lr_mul, frame)
         self.writer.add_scalar('info/lr_mul', lr_mul, frame)
@@ -502,7 +503,7 @@ def obs_to_tensors(self, obs):
 
                 upd_obs[key] = self._obs_to_tensors_internal(value)
         else:
             upd_obs = self.cast_obs(obs)
-        if not obs_is_dict or 'obs' not in obs:  
+        if not obs_is_dict or 'obs' not in obs:
             upd_obs = {'obs' : upd_obs}
         return upd_obs
@@ -567,7 +568,7 @@ def discount_values_masks(self, fdones, last_extrinsic_values, mb_fdones, mb_ext
                 nextvalues = mb_extrinsic_values[t+1]
             nextnonterminal = nextnonterminal.unsqueeze(1)
             masks_t = mb_masks[t].unsqueeze(1)
-            delta = (mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_extrinsic_values[t]) 
+            delta = (mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_extrinsic_values[t])
             mb_advs[t] = lastgaelam = (delta + self.gamma * self.tau * nextnonterminal * lastgaelam) * masks_t
 
         return mb_advs
@@ -576,7 +577,7 @@ def clear_stats(self):
         self.game_rewards.clear()
         self.game_shaped_rewards.clear()
         self.game_lengths.clear()
-        self.mean_rewards = self.last_mean_rewards = -100500
+        self.mean_rewards = self.last_mean_rewards = -1000000000
         self.algo_observer.after_clear_stats()
 
     def update_epoch(self):
@@ -679,7 +680,7 @@ def set_weights(self, weights):
     def get_param(self, param_name):
         if param_name in [
             "grad_norm",
-            "critic_coef", 
+            "critic_coef",
             "bounds_loss_coef",
             "entropy_coef",
             "kl_threshold",
@@ -692,7 +693,7 @@ def get_param(self, param_name):
         elif param_name == "learning_rate":
             return self.last_lr
         else:
-            raise NotImplementedError(f"Can't get param {param_name}")      
+            raise NotImplementedError(f"Can't get param {param_name}")
 
     def set_param(self, param_name, param_value):
         if param_name in [
@@ -753,7 +754,7 @@ def play_steps(self):
             self.experience_buffer.update_data('dones', n, self.dones)
 
             for k in update_list:
-                self.experience_buffer.update_data(k, n, res_dict[k]) 
+                self.experience_buffer.update_data(k, n, res_dict[k])
 
             if self.has_central_value:
                 self.experience_buffer.update_data('states', n, self.obs['states'])
@@ -810,7 +811,7 @@ def play_steps_rnn(self):
         for n in range(self.horizon_length):
             if n % self.seq_length == 0:
                 for s, mb_s in zip(self.rnn_states, mb_rnn_states):
-                    mb_s[n // self.seq_length,:,:,:] = s
+                    mb_s[n // self.seq_length, :, :, :] = s
 
             if self.has_central_value:
                 self.central_value_net.pre_step_rnn(n)
@@ -884,7 +885,7 @@ def play_steps_rnn(self):
         for mb_s in mb_rnn_states:
             t_size = mb_s.size()[0] * mb_s.size()[2]
             h_size = mb_s.size()[3]
-            states.append(mb_s.permute(1,2,0,3).reshape(-1,t_size, h_size))
+            states.append(mb_s.permute(1, 2, 0, 3).reshape(-1, t_size, h_size))
 
         batch_dict['rnn_states'] = states
         batch_dict['step_time'] = step_time
@@ -896,7 +897,7 @@ class DiscreteA2CBase(A2CBase):
 
     def __init__(self, base_name, params):
         A2CBase.__init__(self, base_name, params)
-    
+
         batch_size = self.num_agents * self.num_actors
         action_space = self.env_info['action_space']
         if type(action_space) is gym.spaces.Discrete:
@@ -982,7 +983,7 @@ def prepare_dataset(self, batch_dict):
         neglogpacs = batch_dict['neglogpacs']
         dones = batch_dict['dones']
         rnn_states = batch_dict.get('rnn_states', None)
-        
+
         obses = batch_dict['obses']
 
         advantages = returns - values
@@ -991,7 +992,7 @@ def prepare_dataset(self, batch_dict):
             values = self.value_mean_std(values)
             returns = self.value_mean_std(returns)
             self.value_mean_std.eval()
-        
+
         advantages = torch.sum(advantages, axis=1)
 
         if self.normalize_advantage:
@@ -1034,7 +1035,7 @@ def prepare_dataset(self, batch_dict):
 
     def train(self):
         self.init_tensors()
-        self.mean_rewards = self.last_mean_rewards = -100500
+        self.mean_rewards = self.last_mean_rewards = -1000000000
         start_time = time.perf_counter()
         total_time = 0
         rep_count = 0
@@ -1094,7 +1095,6 @@ def train(self):
                         self.writer.add_scalar('shaped_' + rewards_name + '/iter'.format(i), mean_shaped_rewards[i], epoch_num)
                         self.writer.add_scalar('shaped_' + rewards_name + '/time'.format(i), mean_shaped_rewards[i], total_time)
 
-
                     self.writer.add_scalar('episode_lengths/step', mean_lengths, frame)
                     self.writer.add_scalar('episode_lengths/iter', mean_lengths, epoch_num)
                     self.writer.add_scalar('episode_lengths/time', mean_lengths, total_time)
@@ -1317,7 +1317,7 @@ def prepare_dataset(self, batch_dict):
 
     def train(self):
         self.init_tensors()
-        self.last_mean_rewards = -100500
+        self.last_mean_rewards = -1000000000
         start_time = time.perf_counter()
         total_time = 0
         rep_count = 0
diff --git a/rl_games/common/env_configurations.py b/rl_games/common/env_configurations.py
index 08170847..f7d9f73d 100644
--- a/rl_games/common/env_configurations.py
+++ b/rl_games/common/env_configurations.py
@@ -10,7 +10,6 @@
 import math
 
 
-
 class HCRewardEnv(gym.RewardWrapper):
     def __init__(self, env):
         gym.RewardWrapper.__init__(self, env)
@@ -34,8 +33,6 @@ def step(self, action):
         return observation, reward, done, info
 
 
-
-
 class DMControlObsWrapper(gym.ObservationWrapper):
     def __init__(self, env):
         gym.RewardWrapper.__init__(self, env)
@@ -60,6 +57,7 @@ def create_default_gym_env(**kwargs):
         env = wrappers.LimitStepsWrapper(env)
     return env
 
+
 def create_goal_gym_env(**kwargs):
     frames = kwargs.pop('frames', 1)
     name = kwargs.pop('name')
@@ -72,7 +70,8 @@ def create_goal_gym_env(**kwargs):
         env = wrappers.FrameStack(env, frames, False)
     if limit_steps:
         env = wrappers.LimitStepsWrapper(env)
-    return env 
+    return env
+
 
 def create_slime_gym_env(**kwargs):
     import slimevolleygym
@@ -86,6 +85,7 @@ def create_slime_gym_env(**kwargs):
     env = gym.make(name, **kwargs)
     return env
 
+
 def create_myo(**kwargs):
     from myosuite.utils import gym
     name = kwargs.pop('name')
@@ -93,6 +93,7 @@ def create_myo(**kwargs):
     env = wrappers.OldGymWrapper(env)
     return env
 
+
 def create_atari_gym_env(**kwargs):
     #frames = kwargs.pop('frames', 1)
     name = kwargs.pop('name')
@@ -100,7 +101,8 @@ def create_atari_gym_env(**kwargs):
     episode_life = kwargs.pop('episode_life',True)
     wrap_impala = kwargs.pop('wrap_impala', False)
     env = wrappers.make_atari_deepmind(name, skip=skip,episode_life=episode_life, wrap_impala=wrap_impala, **kwargs)
-    return env    
+    return env
+
 
 def create_dm_control_env(**kwargs):
     frames = kwargs.pop('frames', 1)
@@ -113,6 +115,7 @@ def create_dm_control_env(**kwargs):
         env = wrappers.FrameStack(env, frames, False)
     return env
 
+
 def create_super_mario_env(name='SuperMarioBros-v1'):
     import gym
     from nes_py.wrappers import JoypadSpace
@@ -125,6 +128,7 @@ def create_super_mario_env(name='SuperMarioBros-v1'):
     env = wrappers.wrap_deepmind(env, episode_life=False, clip_rewards=False, frame_stack=True, scale=True)
     return env
 
+
 def create_super_mario_env_stage1(name='SuperMarioBrosRandomStage1-v1'):
     import gym
     from nes_py.wrappers import JoypadSpace
@@ -140,24 +144,27 @@ def create_super_mario_env_stage1(name='SuperMarioBrosRandomStage1-v1'):
 
     env = gym_super_mario_bros.make(stage_names[1])
     env = JoypadSpace(env, SIMPLE_MOVEMENT)
-    
+
     env = wrappers.MaxAndSkipEnv(env, skip=4)
     env = wrappers.wrap_deepmind(env, episode_life=False, clip_rewards=False, frame_stack=True, scale=True)
     #env = wrappers.AllowBacktracking(env)
-    
+
     return env
 
+
 def create_quadrupped_env():
     import gym
     import roboschool
     import quadruppedEnv
     return wrappers.FrameStack(wrappers.MaxAndSkipEnv(gym.make('QuadruppedWalk-v1'), 4, False), 2, True)
 
+
 def create_roboschool_env(name):
     import gym
     import roboschool
     return gym.make(name)
 
+
 def create_smac(name, **kwargs):
     from rl_games.envs.smac_env import SMACEnv, MultiDiscreteSmacWrapper
     frames = kwargs.pop('frames', 1)
@@ -166,8 +173,7 @@ def create_smac(name, **kwargs):
     has_cv = kwargs.get('central_value', False)
     as_single_agent = kwargs.pop('as_single_agent', False)
     env = SMACEnv(name, **kwargs)
-    
-    
+
     if frames > 1:
         if has_cv:
             env = wrappers.BatchedFrameStackWithStates(env, frames, transpose=False, flatten=flatten)
@@ -178,6 +184,7 @@ def create_smac(name, **kwargs):
         env = MultiDiscreteSmacWrapper(env)
     return env
 
+
 def create_smac_v2(name, **kwargs):
     from rl_games.envs.smac_v2_env import SMACEnvV2
     frames = kwargs.pop('frames', 1)
@@ -185,7 +192,7 @@ def create_smac_v2(name, **kwargs):
     flatten = kwargs.pop('flatten', True)
     has_cv = kwargs.get('central_value', False)
     env = SMACEnvV2(name, **kwargs)
-    
+
     if frames > 1:
         if has_cv:
             env = wrappers.BatchedFrameStackWithStates(env, frames, transpose=False, flatten=flatten)
@@ -193,6 +200,7 @@ def create_smac_v2(name, **kwargs):
             env = wrappers.BatchedFrameStack(env, frames, transpose=False, flatten=flatten)
     return env
 
+
 def create_smac_cnn(name, **kwargs):
     from rl_games.envs.smac_env import SMACEnv, MultiDiscreteSmacWrapper
     has_cv = kwargs.get('central_value', False)
@@ -208,6 +216,7 @@ def create_smac_cnn(name, **kwargs):
         env = MultiDiscreteSmacWrapper(env)
     return env
 
+
 def create_test_env(name, **kwargs):
     import rl_games.envs.test
     env = gym.make(name, **kwargs)
@@ -217,7 +226,6 @@ def create_minigrid_env(name, **kwargs):
     import gym_minigrid
     import gym_minigrid.wrappers
 
-
     state_bonus = kwargs.pop('state_bonus', False)
     action_bonus = kwargs.pop('action_bonus', False)
     rgb_fully_obs = kwargs.pop('rgb_fully_obs', False)
@@ -225,7 +233,6 @@ def create_minigrid_env(name, **kwargs):
     view_size = kwargs.pop('view_size', 3)
     env = gym.make(name, **kwargs)
 
-
     if state_bonus:
         env = gym_minigrid.wrappers.StateBonus(env)
     if action_bonus:
@@ -241,17 +248,20 @@ def create_minigrid_env(name, **kwargs):
     print('minigird_env observation space shape:', env.observation_space)
     return env
 
+
 def create_multiwalker_env(**kwargs):
     from rl_games.envs.multiwalker import MultiWalker
-    env = MultiWalker('', **kwargs) 
+    env = MultiWalker('', **kwargs)
     return env
 
+
 def create_diambra_env(**kwargs):
     from rl_games.envs.diambra.diambra import DiambraEnv
     env = DiambraEnv(**kwargs)
     return env
 
+
 def create_env(name, **kwargs):
     steps_limit = kwargs.pop('steps_limit', None)
     env = gym.make(name, **kwargs)
@@ -259,6 +269,7 @@ def create_env(name, **kwargs):
         env = wrappers.TimeLimit(env, steps_limit)
     return env
 
+
 # Dictionary of env_name as key and a sub-dict containing env_type and a env-creator function
 configurations = {
     'CartPole-v1' : {
@@ -450,16 +461,17 @@ def get_env_info(env):
     '''
     if isinstance(result_shapes['observation_space'], gym.spaces.dict.Dict):
         result_shapes['observation_space'] = observation_space['observations']
-        
+
     if isinstance(result_shapes['observation_space'], dict):
         result_shapes['observation_space'] = observation_space['observations']
         result_shapes['state_space'] = observation_space['states']
     '''
-    if hasattr(env, "value_size"):    
+    if hasattr(env, "value_size"):
         result_shapes['value_size'] = env.value_size
     print(result_shapes)
     return result_shapes
 
+
 def get_obs_and_action_spaces_from_config(config):
     env_config = config.get('env_config', {})
     env = configurations[config['env_name']]['env_creator'](**env_config)
@@ -476,4 +488,4 @@ def register(name, config):
         config (:obj:`dict`): Dictionary with env type and a creator function.
     """
 
-    configurations[name] = config
\ No newline at end of file
+    configurations[name] = config
diff --git a/rl_games/common/experience.py b/rl_games/common/experience.py
index feea017c..8be1da35 100644
--- a/rl_games/common/experience.py
+++ b/rl_games/common/experience.py
@@ -7,6 +7,7 @@
 
 from rl_games.algos_torch.torch_ext import numpy_to_torch_dtype_dict
 
+
 class ReplayBuffer(object):
     def __init__(self, size, ob_space):
         """Create Replay buffer.
@@ -222,7 +223,6 @@ def __init__(self, obs_shape, action_shape, capacity, device):
         self.capacity = capacity
         self.idx = 0
         self.full = False
-
 
     def add(self, obs, action, reward, next_obs, done):
 
@@ -268,7 +268,7 @@ def sample(self, batch_size):
         """
         idxs = torch.randint(0,
-                             self.capacity if self.full else self.idx, 
+                             self.capacity if self.full else self.idx,
                              (batch_size,),
                              device=self.device)
         obses = self.obses[idxs]
         actions = self.actions[idxs]
@@ -279,9 +279,6 @@ def sample(self, batch_size):
 
         return obses, actions, rewards, next_obses, dones
 
-
-
-
 class ExperienceBuffer:
     '''
     More generalized than replay buffers.
@@ -294,7 +291,7 @@ def __init__(self, env_info, algo_info, device, aux_tensor_dict=None):
 
         self.num_agents = env_info.get('agents', 1)
         self.action_space = env_info['action_space']
-        
+
         self.num_actors = algo_info['num_actors']
         self.horizon_length = algo_info['horizon_length']
         self.has_central_value = algo_info['has_central_value']
@@ -314,7 +311,7 @@ def __init__(self, env_info, algo_info, device, aux_tensor_dict=None):
             self.actions_num = [action.n for action in self.action_space]
             self.is_multi_discrete = True
         if type(self.action_space) is gym.spaces.Box:
-            self.actions_shape = (self.action_space.shape[0],) 
+            self.actions_shape = (self.action_space.shape[0],)
             self.actions_num = self.action_space.shape[0]
             self.is_continuous = True
         self.tensor_dict = {}
@@ -331,7 +328,7 @@ def _init_from_env_info(self, env_info):
         self.tensor_dict['obses'] = self._create_tensor_from_space(env_info['observation_space'], obs_base_shape)
         if self.has_central_value:
             self.tensor_dict['states'] = self._create_tensor_from_space(env_info['state_space'], state_base_shape)
-        
+
         val_space = gym.spaces.Box(low=0, high=1,shape=(env_info.get('value_size',1),))
         self.tensor_dict['rewards'] = self._create_tensor_from_space(val_space, obs_base_shape)
         self.tensor_dict['values'] = self._create_tensor_from_space(val_space, obs_base_shape)
@@ -349,55 +346,54 @@ def _init_from_env_info(self, env_info):
 
     def _init_from_aux_dict(self, tensor_dict):
         obs_base_shape = self.obs_base_shape
-        for k,v in tensor_dict.items():
+        for k, v in tensor_dict.items():
             self.tensor_dict[k] = self._create_tensor_from_space(gym.spaces.Box(low=0, high=1,shape=(v), dtype=np.float32), obs_base_shape)
 
     def _create_tensor_from_space(self, space, base_shape):
         if type(space) is gym.spaces.Box:
             dtype = numpy_to_torch_dtype_dict[space.dtype]
-            return torch.zeros(base_shape + space.shape, dtype= dtype, device = self.device)
+            return torch.zeros(base_shape + space.shape, dtype=dtype, device=self.device)
         if type(space) is gym.spaces.Discrete:
             dtype = numpy_to_torch_dtype_dict[space.dtype]
-            return torch.zeros(base_shape, dtype= dtype, device = self.device)
+            return torch.zeros(base_shape, dtype=dtype, device=self.device)
         if type(space) is gym.spaces.Tuple:
             '''
             assuming that tuple is only Discrete tuple
             '''
             dtype = numpy_to_torch_dtype_dict[space.dtype]
             tuple_len = len(space)
-            return torch.zeros(base_shape +(tuple_len,), dtype= dtype, device = self.device)
+            return torch.zeros(base_shape + (tuple_len,), dtype=dtype, device=self.device)
         if type(space) is gym.spaces.Dict:
             t_dict = {}
-            for k,v in space.spaces.items():
+            for k, v in space.spaces.items():
                 t_dict[k] = self._create_tensor_from_space(v, base_shape)
             return t_dict
 
     def update_data(self, name, index, val):
         if type(val) is dict:
-            for k,v in val.items():
-                self.tensor_dict[name][k][index,:] = v
+            for k, v in val.items():
+                self.tensor_dict[name][k][index, :] = v
         else:
-            self.tensor_dict[name][index,:] = val
-            
+            self.tensor_dict[name][index, :] = val
 
-    def update_data_rnn(self, name, indices,play_mask, val):
+    def update_data_rnn(self, name, indices, play_mask, val):
         if type(val) is dict:
             for k,v in val:
-                self.tensor_dict[name][k][indices,play_mask] = v
+                self.tensor_dict[name][k][indices, play_mask] = v
         else:
-            self.tensor_dict[name][indices,play_mask] = val
+            self.tensor_dict[name][indices, play_mask] = val
 
     def get_transformed(self, transform_op):
         res_dict = {}
         for k, v in self.tensor_dict.items():
             if type(v) is dict:
                 transformed_dict = {}
-                for kd,vd in v.items():
+                for kd, vd in v.items():
                     transformed_dict[kd] = transform_op(vd)
                 res_dict[k] = transformed_dict
             else:
                 res_dict[k] = transform_op(v)
-        
+
         return res_dict
 
     def get_transformed_list(self, transform_op, tensor_list):
@@ -408,10 +404,10 @@ def get_transformed_list(self, transform_op, tensor_list):
                 continue
             if type(v) is dict:
                 transformed_dict = {}
-                for kd,vd in v.items():
+                for kd, vd in v.items():
                     transformed_dict[kd] = transform_op(vd)
                 res_dict[k] = transformed_dict
             else:
                 res_dict[k] = transform_op(v)
-        
+
         return res_dict
diff --git a/rl_games/common/vecenv.py b/rl_games/common/vecenv.py
index c29fd4be..af9c3971 100644
--- a/rl_games/common/vecenv.py
+++ b/rl_games/common/vecenv.py
@@ -115,12 +115,15 @@ def get_env_info(self):
 
 class RayVecEnv(IVecEnv):
     """Main env class that manages several `rl_games.common.vecenv.Rayworker` objects for parallel training
-    
+
     The RayVecEnv class manages a set of individual environments and wraps around the methods from RayWorker.
     Each worker is executed asynchronously.
     """
 
-    import ray
+    try:
+        import ray
+    except ImportError:
+        pass
 
     def __init__(self, config_name, num_actors, **kwargs):
         """Initialise the class. Sets up the config for the environment and creates individual workers to manage.
diff --git a/rl_games/common/wrappers.py b/rl_games/common/wrappers.py
index dab4a648..ed657d26 100644
--- a/rl_games/common/wrappers.py
+++ b/rl_games/common/wrappers.py
@@ -1,4 +1,3 @@
-import gymnasium
 import numpy as np
 from numpy.random import randint
 
@@ -11,11 +10,10 @@
 from copy import copy
 
-
 class InfoWrapper(gym.Wrapper):
     def __init__(self, env):
         gym.RewardWrapper.__init__(self, env)
-        
+
         self.reward = 0
 
     def reset(self, **kwargs):
         self.reward = 0
@@ -149,8 +147,8 @@ def __init__(self, env,skip=4, use_max = True):
             self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
         else:
             self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.float32)
-        self._skip = skip 
-        
+        self._skip = skip
+
     def step(self, action):
         """Repeat action, sum reward, and max over last observations."""
         total_reward = 0.0
@@ -262,7 +260,7 @@ def _get_ob(self):
 
 
 class BatchedFrameStack(gym.Wrapper):
-    def __init__(self, env, k, transpose = False, flatten = False):
+    def __init__(self, env, k, transpose=False, flatten=False):
         gym.Wrapper.__init__(self, env)
         self.k = k
         self.frames = deque([], maxlen=k)
@@ -303,8 +301,9 @@ def _get_ob(self):
             frames = np.transpose(self.frames, (1, 0, 2))
         return frames
 
+
 class BatchedFrameStackWithStates(gym.Wrapper):
-    def __init__(self, env, k, transpose = False, flatten = False):
+    def __init__(self, env, k, transpose=False, flatten=False):
         gym.Wrapper.__init__(self, env)
         self.k = k
         self.obses = deque([], maxlen=k)
@@ -346,7 +345,7 @@ def _get_ob(self):
         assert len(self.obses) == self.k
         obses = self.process_data(self.obses)
         states = self.process_data(self.states)
-        return {"obs": obses, "state" : states}
+        return {"obs": obses, "state": states}
 
     def process_data(self, data):
         if len(np.shape(data)) < 3:
@@ -363,14 +362,15 @@ def process_data(self, data):
             obses = np.transpose(data, (1, 0, 2))
         return obses
 
+
 class ProcgenStack(gym.Wrapper):
-    def __init__(self, env, k = 2, greyscale=True):
+    def __init__(self, env, k=2, greyscale=True):
         gym.Wrapper.__init__(self, env)
         self.k = k
         self.curr_frame = 0
         self.frames = deque([], maxlen=k)
 
-        self.greyscale=greyscale
+        self.greyscale = greyscale
         self.prev_frame = None
         shp = env.observation_space.shape
         if greyscale:
@@ -421,6 +421,7 @@ def observation(self, observation):
         # with smaller replay buffers only.
         return np.array(observation).astype(np.float32) / 255.0
 
+
 class LazyFrames(object):
     def __init__(self, frames):
         """This object ensures that common frames between the observations are only stored once.
@@ -449,6 +450,7 @@ def __len__(self):
     def __getitem__(self, i):
         return self._force()[i]
 
+
 class ReallyDoneWrapper(gym.Wrapper):
     def __init__(self, env):
         """
@@ -471,6 +473,7 @@ def step(self, action):
         done = lives == 0
         return obs, reward, done, info
 
+
 class AllowBacktracking(gym.Wrapper):
     """
     Use deltas in max(X) as the reward, rather than deltas
@@ -506,6 +509,7 @@ def unwrap(env):
     else:
         return env
 
+
 class StickyActionEnv(gym.Wrapper):
     def __init__(self, env, p=0.25):
         super(StickyActionEnv, self).__init__(env)
@@ -627,15 +631,22 @@ def __init__(self, env, name):
 
     def observation(self, observation):
         return observation * self.mask
 
 
+
 class OldGymWrapper(gym.Env):
     def __init__(self, env):
         self.env = env
+        import gymnasium
+
+        # Convert Gymnasium spaces to Gym spaces
         self.observation_space = self.convert_space(env.observation_space)
         self.action_space = self.convert_space(env.action_space)
 
     def convert_space(self, space):
+        import gymnasium
+
+        """Recursively convert Gymnasium spaces to Gym spaces."""
 
         if isinstance(space, gymnasium.spaces.Box):
             return gym.spaces.Box(
@@ -691,6 +702,7 @@ def render(self, mode='human'):
     def close(self):
         return self.env.close()
 
+
 # Example usage:
 if __name__ == "__main__":
     # Create a MyoSuite environment
@@ -719,7 +731,7 @@ def make_atari(env_id, timelimit=True, noop_max=0, skip=4, sticky=False, directo
         env = StickyActionEnv(env)
     env = InfoWrapper(env)
     if directory != None:
-        env = gym.wrappers.Monitor(env,directory=directory,force=True)
+        env = gym.wrappers.Monitor(env, directory=directory, force=True)
     if sticky:
         env = StickyActionEnv(env)
     if not timelimit:
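
Note on the recurring API change above: the torch.cuda.amp entry points used previously are deprecated in favor of the device-agnostic torch.amp ones, which take the device type as their first argument. Below is a minimal sketch of the before/after pattern, assuming PyTorch 2.3+ (where torch.amp.GradScaler accepts the device string) and an available CUDA device; the model and optimizer are only illustrative.

    import torch

    mixed_precision = True
    model = torch.nn.Linear(8, 1).cuda()
    optimizer = torch.optim.Adam(model.parameters())

    # was: scaler = torch.cuda.amp.GradScaler(enabled=mixed_precision)
    scaler = torch.amp.GradScaler('cuda', enabled=mixed_precision)

    # was: with torch.cuda.amp.autocast(enabled=mixed_precision):
    with torch.amp.autocast('cuda', enabled=mixed_precision):
        loss = model(torch.randn(4, 8, device='cuda')).pow(2).mean()

    # Scale the loss, step the optimizer, and update the scale factor.
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()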