From b6769612dcbf90527aba144a933a4871ad99c9ec Mon Sep 17 00:00:00 2001 From: Justin Lidard <60638575+jlidard@users.noreply.github.com> Date: Fri, 27 Sep 2024 12:39:32 -0400 Subject: [PATCH] Ibrl (#2) * remove dataset consistency check * add pretrain configs * rename * transport pretrain cfg * add ibrl * fix base policy * set `deterministic=True` when sampling in diffusion evaluation * minors * Revert "add rlpd framework" * Revert "Revert "add rlpd framework"" (#4) * match rlpd param names * rename to `StitchedSequenceQLearningDataset` * add configs * add `tanh_output` and dropout to gaussians * fix ibrl * minors --------- Co-authored-by: Justin M. Lidard Co-authored-by: allenzren --- agent/finetune/train_ibrl_agent.py | 334 ++++++++++++++++++ agent/finetune/train_sac_agent.py | 18 +- cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml | 54 +++ cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml | 107 ++++++ cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml | 98 +++++ cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml | 108 ++++++ cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml | 2 +- cfg/gym/finetune/hopper-v2/sac_mlp.yaml | 3 +- .../pre_gaussian_mlp.yaml | 61 ++++ .../hopper-medium-v2/pre_gaussian_mlp.yaml | 60 ++++ .../walker2d-medium-v2/pre_gaussian_mlp.yaml | 60 ++++ cfg/robomimic/finetune/square/rlpd_mlp.yaml | 114 ++++++ .../pretrain/can/pre_gaussian_mlp_ibrl.yaml | 59 ++++ .../pretrain/lift/pre_gaussian_mlp_ibrl.yaml | 60 ++++ .../square/pre_gaussian_mlp_ibrl.yaml | 60 ++++ .../transport/pre_gaussian_mlp_ibrl.yaml | 60 ++++ model/common/gaussian.py | 14 +- model/common/mlp.py | 58 ++- model/common/mlp_gaussian.py | 10 +- model/rl/gaussian_ibrl.py | 193 ++++++++++ model/rl/gaussian_rlpd.py | 20 +- model/rl/gaussian_sac.py | 17 - script/dataset/README.md | 16 +- script/dataset/get_d4rl_dataset.py | 8 +- 24 files changed, 1502 insertions(+), 92 deletions(-) create mode 100644 agent/finetune/train_ibrl_agent.py create mode 100644 cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml create mode 100644 cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml create mode 100644 cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml create mode 100644 cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml create mode 100644 cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml create mode 100644 cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml create mode 100644 cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml create mode 100644 cfg/robomimic/finetune/square/rlpd_mlp.yaml create mode 100644 cfg/robomimic/pretrain/can/pre_gaussian_mlp_ibrl.yaml create mode 100644 cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml create mode 100644 cfg/robomimic/pretrain/square/pre_gaussian_mlp_ibrl.yaml create mode 100644 cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml create mode 100644 model/rl/gaussian_ibrl.py diff --git a/agent/finetune/train_ibrl_agent.py b/agent/finetune/train_ibrl_agent.py new file mode 100644 index 0000000..332db5c --- /dev/null +++ b/agent/finetune/train_ibrl_agent.py @@ -0,0 +1,334 @@ +""" +Imitation Bootstrapped Reinforcement Learning (IBRL) agent training script. + +Does not support image observations right now. 
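+
+Pipeline (as implemented below): seed a FIFO replay buffer with the offline demonstration
+dataset, roll out the current IBRL policy in the (vectorized) environment, then on each update
+step run `critic_num_update` critic updates (each followed by an EMA target-critic update) and a
+single actor update with an EMA target-actor update.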
+""" + +import os +import pickle +import numpy as np +import torch +import logging +import wandb +import hydra +from collections import deque + +log = logging.getLogger(__name__) +from util.timer import Timer +from agent.finetune.train_agent import TrainAgent +from util.scheduler import CosineAnnealingWarmupRestarts + + +class TrainIBRLAgent(TrainAgent): + def __init__(self, cfg): + super().__init__(cfg) + + # Build dataset + self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset) + + # note the discount factor gamma here is applied to reward every act_steps, instead of every env step + self.gamma = cfg.train.gamma + + # Optimizer + self.actor_optimizer = torch.optim.AdamW( + self.model.network.parameters(), + lr=cfg.train.actor_lr, + weight_decay=cfg.train.actor_weight_decay, + ) + self.actor_lr_scheduler = CosineAnnealingWarmupRestarts( + self.actor_optimizer, + first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.actor_lr, + min_lr=cfg.train.actor_lr_scheduler.min_lr, + warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps, + gamma=1.0, + ) + self.critic_optimizer = torch.optim.AdamW( + self.model.critic_networks.parameters(), + lr=cfg.train.critic_lr, + weight_decay=cfg.train.critic_weight_decay, + ) + self.critic_lr_scheduler = CosineAnnealingWarmupRestarts( + self.critic_optimizer, + first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.critic_lr, + min_lr=cfg.train.critic_lr_scheduler.min_lr, + warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps, + gamma=1.0, + ) + + # Perturbation scale + self.target_ema_rate = cfg.train.target_ema_rate + + # Reward scale + self.scale_reward_factor = cfg.train.scale_reward_factor + + # Number of critic updates + self.critic_num_update = cfg.train.critic_num_update + + # Update frequency + self.update_freq = cfg.train.update_freq + + # Buffer size + self.buffer_size = cfg.train.buffer_size + + # Eval episodes + self.n_eval_episode = cfg.train.n_eval_episode + + # Exploration steps at the beginning - using randomly sampled action + self.n_explore_steps = cfg.train.n_explore_steps + + def run(self): + # make a FIFO replay buffer for obs, action, and reward + obs_buffer = deque(maxlen=self.buffer_size) + next_obs_buffer = deque(maxlen=self.buffer_size) + action_buffer = deque(maxlen=self.buffer_size) + reward_buffer = deque(maxlen=self.buffer_size) + done_buffer = deque(maxlen=self.buffer_size) + + # collect the offline dataset + states = self.dataset_offline.states + next_states = torch.roll(states, shifts=1, dims=0) + next_states[0] = 0 + actions = self.dataset_offline.actions + rewards = self.dataset_offline.rewards + dones = self.dataset_offline.dones + + # initailize the replay buffer with offline data + obs_buffer.extend(states[: self.buffer_size, None].cpu().numpy()) + next_obs_buffer.extend(next_states[: self.buffer_size, None].cpu().numpy()) + action_buffer.extend(actions[: self.buffer_size, None].cpu().numpy()) + reward_buffer.extend(rewards[: self.buffer_size].cpu().numpy()) + done_buffer.extend(dones[: self.buffer_size].cpu().numpy()) + + # Start training loop + timer = Timer() + run_results = [] + done_venv = np.zeros((1, self.n_envs)) + while self.itr < self.n_train_itr: + if self.itr % 1000 == 0: + print(f"Finished training iteration {self.itr} of {self.n_train_itr}") + + # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple 
episodes from one env + options_venv = [{} for _ in range(self.n_envs)] + if self.itr % self.render_freq == 0 and self.render_video: + for env_ind in range(self.n_render): + options_venv[env_ind]["video_path"] = os.path.join( + self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4" + ) + + # Define train or eval - all envs restart + eval_mode = ( + self.itr % self.val_freq == 0 + and self.itr > self.n_explore_steps + and not self.force_train + ) + n_steps = ( + self.n_steps if not eval_mode else int(1e5) + ) # large number for eval mode + self.model.eval() if eval_mode else self.model.train() + + # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning + firsts_trajs = np.empty((0, self.n_envs)) + if self.reset_at_iteration or eval_mode or self.itr == 0: + prev_obs_venv = self.reset_env_all(options_venv=options_venv) + firsts_trajs = np.vstack((firsts_trajs, np.ones((1, self.n_envs)))) + else: + # if done at the end of last iteration, then the envs are just reset + firsts_trajs = np.vstack((firsts_trajs, done_venv)) + reward_trajs = np.empty((0, self.n_envs)) + + # Collect a set of trajectories from env + cnt_episode = 0 + for _ in range(n_steps): + + # Select action + if self.itr < self.n_explore_steps: + action_venv = self.venv.action_space.sample() + else: + with torch.no_grad(): + cond = { + "state": torch.from_numpy(prev_obs_venv["state"]) + .float() + .to(self.device) + } + samples = ( + self.model( + cond=cond, + deterministic=eval_mode, + ) + .cpu() + .numpy() + ) # n_env x horizon x act + action_venv = samples[:, : self.act_steps] + + # Apply multi-step action + obs_venv, reward_venv, done_venv, info_venv = self.venv.step( + action_venv + ) + reward_trajs = np.vstack((reward_trajs, reward_venv[None])) + + # add to buffer in train mode + if not eval_mode: + for i in range(self.n_envs): + obs_buffer.append(prev_obs_venv["state"][i]) + next_obs_buffer.append(obs_venv["state"][i]) + action_buffer.append(action_venv[i]) + reward_buffer.append(reward_venv[i] * self.scale_reward_factor) + done_buffer.append(done_venv[i]) + firsts_trajs = np.vstack( + (firsts_trajs, done_venv) + ) # offset by one step + prev_obs_venv = obs_venv + + # check if enough eval episodes are done + cnt_episode += np.sum(done_venv) + if eval_mode and cnt_episode >= self.n_eval_episode: + break + + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
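+            # An episode is delimited by two consecutive reset markers (1s) in firsts_trajs;
+            # a trailing segment without a closing marker is treated as unfinished and skipped.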
+ episodes_start_end = [] + for env_ind in range(self.n_envs): + env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0] + for i in range(len(env_steps) - 1): + start = env_steps[i] + end = env_steps[i + 1] + if end - start > 1: + episodes_start_end.append((env_ind, start, end - 1)) + if len(episodes_start_end) > 0: + reward_trajs_split = [ + reward_trajs[start : end + 1, env_ind] + for env_ind, start, end in episodes_start_end + ] + num_episode_finished = len(reward_trajs_split) + episode_reward = np.array( + [np.sum(reward_traj) for reward_traj in reward_trajs_split] + ) + episode_best_reward = np.array( + [ + np.max(reward_traj) / self.act_steps + for reward_traj in reward_trajs_split + ] + ) + avg_episode_reward = np.mean(episode_reward) + avg_best_reward = np.mean(episode_best_reward) + success_rate = np.mean( + episode_best_reward >= self.best_reward_threshold_for_success + ) + else: + episode_reward = np.array([]) + num_episode_finished = 0 + avg_episode_reward = 0 + avg_best_reward = 0 + success_rate = 0 + + # Update models + if ( + not eval_mode + and self.itr > self.n_explore_steps + and self.itr % self.update_freq == 0 + ): + obs_array = np.array(obs_buffer) + next_obs_array = np.array(next_obs_buffer) + actions_array = np.array(action_buffer) + rewards_array = np.array(reward_buffer) + dones_array = np.array(done_buffer) + + # Update critic more frequently + for _ in range(self.critic_num_update): + # Sample from online buffer + inds = np.random.choice(len(obs_buffer), self.batch_size) + obs_b = torch.from_numpy(obs_array[inds]).float().to(self.device) + next_obs_b = ( + torch.from_numpy(next_obs_array[inds]).float().to(self.device) + ) + actions_b = ( + torch.from_numpy(actions_array[inds]).float().to(self.device) + ) + rewards_b = ( + torch.from_numpy(rewards_array[inds]).float().to(self.device) + ) + dones_b = ( + torch.from_numpy(dones_array[inds]).float().to(self.device) + ) + # Update critic + loss_critic = self.model.loss_critic( + {"state": obs_b}, + {"state": next_obs_b}, + actions_b, + rewards_b, + dones_b, + self.gamma, + ) + self.critic_optimizer.zero_grad() + loss_critic.backward() + self.critic_optimizer.step() + + # Update target critic every critic update + self.model.update_target_critic(self.target_ema_rate) + + # Update actor once with the final batch + loss_actor = self.model.loss_actor( + {"state": obs_b}, + ) + self.actor_optimizer.zero_grad() + loss_actor.backward() + self.actor_optimizer.step() + + # Update target actor + self.model.update_target_actor(self.target_ema_rate) + + # Update lr + self.actor_lr_scheduler.step() + self.critic_lr_scheduler.step() + + # Save model + if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1: + self.save_model() + + # Log loss and save metrics + run_results.append({"itr": self.itr}) + if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps: + time = timer() + if eval_mode: + log.info( + f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "success rate - eval": success_rate, + "avg episode reward - eval": avg_episode_reward, + "avg best reward - eval": avg_best_reward, + "num episode - eval": num_episode_finished, + }, + step=self.itr, + commit=False, + ) + run_results[-1]["eval_success_rate"] = success_rate + run_results[-1]["eval_episode_reward"] = avg_episode_reward + run_results[-1]["eval_best_reward"] = avg_best_reward + else: + log.info( + f"{self.itr}: loss 
actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "loss - actor": loss_actor, + "loss - critic": loss_critic, + "avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + }, + step=self.itr, + commit=True, + ) + run_results[-1]["loss_actor"] = loss_actor + run_results[-1]["loss_critic"] = loss_critic + run_results[-1]["train_episode_reward"] = avg_episode_reward + run_results[-1]["time"] = time + with open(self.result_path, "wb") as f: + pickle.dump(run_results, f) + self.itr += 1 diff --git a/agent/finetune/train_sac_agent.py b/agent/finetune/train_sac_agent.py index 27dccc1..cc4798e 100644 --- a/agent/finetune/train_sac_agent.py +++ b/agent/finetune/train_sac_agent.py @@ -62,8 +62,12 @@ def __init__(self, cfg): self.scale_reward_factor = cfg.train.scale_reward_factor # Actor/critic update frequency - assume single env - self.critic_update_freq = int(cfg.train.batch_size / cfg.train.critic_replay_ratio) - self.actor_update_freq = int(cfg.train.batch_size / cfg.train.actor_replay_ratio) + self.critic_update_freq = int( + cfg.train.batch_size / cfg.train.critic_replay_ratio + ) + self.actor_update_freq = int( + cfg.train.batch_size / cfg.train.actor_replay_ratio + ) # Buffer size self.buffer_size = cfg.train.buffer_size @@ -215,10 +219,12 @@ def run(self): success_rate = 0 # Update models - if not eval_mode and self.itr > self.n_explore_steps and self.itr % self.critic_update_freq == 0: - inds = np.random.choice( - len(obs_buffer), self.batch_size, replace=False - ) + if ( + not eval_mode + and self.itr > self.n_explore_steps + and self.itr % self.critic_update_freq == 0 + ): + inds = np.random.choice(len(obs_buffer), self.batch_size, replace=False) obs_b = ( torch.from_numpy(np.array([obs_buffer[i] for i in inds])) .float() diff --git a/cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml b/cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml new file mode 100644 index 0000000..9759941 --- /dev/null +++ b/cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml @@ -0,0 +1,54 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.eval.eval_gaussian_agent.EvalGaussianAgent + +name: ${env_name}_eval_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +n_steps: 1000 # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation. 
+render_num: 0 + +env: + n_envs: 40 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +model: + _target_: model.common.gaussian.GaussianModel + # + network_path: ${base_policy_path} + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: Mish + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${action_dim} + horizon_steps: ${horizon_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml new file mode 100644 index 0000000..dcdef35 --- /dev/null +++ b/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml @@ -0,0 +1,107 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: rlpd-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 250000 + n_steps: 1 + gamma: 0.99 + actor_lr: 3e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 50000 + val_freq: 1000 + render: + freq: 1 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.005 + scale_reward_factor: 0.1 + critic_num_update: 2 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 2000 # used in RLPD locomotion + target_entropy: ${eval:'- ${action_dim} * ${act_steps} / 2'} + init_temperature: 1 + +model: + _target_: model.rl.gaussian_rlpd.RLPD_Gaussian + randn_clip_value: 10 + backup_entropy: True + n_critics: 5 # Ensemble size for critic models + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${action_dim} + std_max: 7.3891 + std_min: 2.061e-9 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: 
${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml new file mode 100644 index 0000000..622c5ba --- /dev/null +++ b/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml @@ -0,0 +1,98 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_sac_agent.TrainSACAgent + +name: ${env_name}_sac_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +device: cuda:0 +env_name: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: sac-gym-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 3e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100000 + val_freq: 10000 + render: + freq: 1 + num: 0 + log_freq: 200 + # SAC specific + batch_size: 128 + target_ema_rate: 0.005 + scale_reward_factor: 0.1 + critic_replay_ratio: 128 + actor_replay_ratio: 64 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 5000 + target_entropy: ${eval:'- ${action_dim} * ${act_steps} / 2'} + init_temperature: 0.2 + +model: + _target_: model.rl.gaussian_sac.SAC_Gaussian + randn_clip_value: 10 + tanh_output: True # squash after sampling + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${action_dim} + std_max: 7.3891 + std_min: 2.061e-9 + critic: # no layernorm + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256] + activation_type: ReLU + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} diff --git a/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml b/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml new file mode 100644 index 0000000..8c2e601 --- /dev/null +++ b/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml @@ -0,0 +1,108 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +base_policy_path: 
${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_gaussian_mlp_ta1/2024-09-25_14-57-07_42/checkpoint/state_1000.pt +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: ibrl-gym-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 250000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 5000 + render: + freq: 1 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 0.1 + critic_num_update: 5 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.rl.gaussian_ibrl.IBRL_Gaussian + network_path: ${base_policy_path} + randn_clip_value: 3 + n_critics: 5 # Ensemble size for critic models + soft_action_sample: True + soft_action_sample_beta: 0.1 + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${action_dim} + fixed_std: 0.1 + # dropout: 0.5 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml b/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml index 8b9ec1f..f40e884 100644 --- a/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml +++ b/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml @@ -55,7 +55,7 @@ train: first_cycle_steps: 1000 warmup_steps: 10 min_lr: 3e-4 - save_model_freq: 100000 + save_model_freq: 50000 val_freq: 5000 render: freq: 1 diff --git a/cfg/gym/finetune/hopper-v2/sac_mlp.yaml b/cfg/gym/finetune/hopper-v2/sac_mlp.yaml index 560f37f..803d334 100644 --- a/cfg/gym/finetune/hopper-v2/sac_mlp.yaml +++ b/cfg/gym/finetune/hopper-v2/sac_mlp.yaml @@ -36,7 +36,7 @@ env: wandb: entity: ${oc.env:DPPO_WANDB_ENTITY} - project: sac-gym-${env_name}-finetune + project: sac-gym-${env_name} run: ${now:%H-%M-%S}_${name} train: @@ -85,7 +85,6 @@ model: transition_dim: ${action_dim} std_max: 7.3891 std_min: 2.061e-9 - # fixed_std: 0.1 critic: # no layernorm _target_: model.common.critic.CriticObsAct mlp_dims: [256, 256] diff --git a/cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml new file 
mode 100644 index 0000000..5c4fb7f --- /dev/null +++ b/cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml @@ -0,0 +1,61 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:1 +env: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +transition_dim: ${action_dim} +horizon_steps: 4 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain-gaussian + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 3000 + batch_size: 128 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 3000 + warmup_steps: 1 + min_lr: 1e-4 + epoch_start_ema: 10 + update_ema_freq: 5 + save_model_freq: 100 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${transition_dim} + fixed_std: 0.1 + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml new file mode 100644 index 0000000..9a97086 --- /dev/null +++ b/cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml @@ -0,0 +1,60 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:0 +env: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +transition_dim: ${action_dim} +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 500 + batch_size: 128 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 1 + min_lr: 1e-4 + epoch_start_ema: 10 + update_ema_freq: 5 + save_model_freq: 100 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: Mish + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${transition_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml new file mode 100644 index 0000000..3917aff --- /dev/null +++ b/cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml @@ -0,0 +1,60 @@ 
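+# Behavior-cloning pretraining config for a Gaussian MLP policy on walker2d-medium-v2
+# (the resulting checkpoint can serve as the base policy for IBRL/RLPD fine-tuning).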
+defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:1 +env: walker2d-medium-v2 +obs_dim: 17 +action_dim: 6 +transition_dim: ${action_dim} +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain-gaussian + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 3000 + batch_size: 128 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 3000 + warmup_steps: 1 + min_lr: 1e-4 + epoch_start_ema: 10 + update_ema_freq: 5 + save_model_freq: 100 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${transition_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/rlpd_mlp.yaml b/cfg/robomimic/finetune/square/rlpd_mlp.yaml new file mode 100644 index 0000000..a34d26a --- /dev/null +++ b/cfg/robomimic/finetune/square/rlpd_mlp.yaml @@ -0,0 +1,114 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 400 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: rlpd-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 250000 + n_steps: 1 + gamma: 0.99 + actor_lr: 3e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 50000 + val_freq: 5000 + render: + freq: 1 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.005 + scale_reward_factor: 0.1 + critic_num_update: 20 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 5000 + target_entropy: ${eval:'- ${action_dim} * ${act_steps} / 2'} + 
init_temperature: 1 + +model: + _target_: model.rl.gaussian_rlpd.RLPD_Gaussian + randn_clip_value: 10 + backup_entropy: True + n_critics: 10 # Ensemble size for critic models + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${action_dim} + std_max: 7.3891 + std_min: 2.061e-9 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + residual_style: False + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ibrl.yaml new file mode 100644 index 0000000..9bd074d --- /dev/null +++ b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ibrl.yaml @@ -0,0 +1,59 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz + +seed: 42 +device: cuda:0 +env: can +obs_dim: 23 +action_dim: 7 +transition_dim: ${action_dim} +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain-gaussian + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 5000 + warmup_steps: 100 + min_lr: 1e-5 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${transition_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml new file mode 100644 index 0000000..69a49ae --- /dev/null +++ b/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml @@ -0,0 +1,60 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz + +seed: 42 +device: cuda:0 +env: lift +obs_dim: 19 +action_dim: 7 +transition_dim: ${action_dim} +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain + 
run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 5000 + warmup_steps: 100 + min_lr: 1e-5 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${transition_dim} + horizon_steps: ${horizon_steps} + device: ${device} + + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ibrl.yaml new file mode 100644 index 0000000..c78661f --- /dev/null +++ b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ibrl.yaml @@ -0,0 +1,60 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz + +seed: 42 +device: cuda:0 +env: square +obs_dim: 23 +action_dim: 7 +transition_dim: ${action_dim} +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 5000 + warmup_steps: 100 + min_lr: 1e-5 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${transition_dim} + horizon_steps: ${horizon_steps} + device: ${device} + + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml new file mode 100644 index 0000000..e9ca5f7 --- /dev/null +++ b/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml @@ -0,0 +1,60 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz + +seed: 42 +device: cuda:0 +env: transport +obs_dim: 59 +action_dim: 14 +transition_dim: ${action_dim} +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 5000 + 
warmup_steps: 100 + min_lr: 1e-5 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${transition_dim} + horizon_steps: ${horizon_steps} + device: ${device} + + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/model/common/gaussian.py b/model/common/gaussian.py index d7725fd..5246f38 100644 --- a/model/common/gaussian.py +++ b/model/common/gaussian.py @@ -19,6 +19,7 @@ def __init__( network_path=None, device="cuda:0", randn_clip_value=10, + tanh_output=False, ): super().__init__() self.device = device @@ -29,7 +30,7 @@ def __init__( ) self.load_state_dict( checkpoint["model"], - strict=False, + strict=True, ) log.info("Loaded actor from %s", network_path) log.info( @@ -40,12 +41,16 @@ def __init__( # Clip sampled randn (from standard deviation) such that the sampled action is not too far away from mean self.randn_clip_value = randn_clip_value + # Whether to apply tanh to the **sampled** action --- used in SAC + self.tanh_output = tanh_output + def loss( self, true_action, cond, ent_coef, ): + """no squashing""" B = len(true_action) dist = self.forward_train( cond, @@ -82,7 +87,6 @@ def forward( network_override=None, reparameterize=False, get_logprob=False, - apply_squashing=False, ): B = len(cond["state"]) if "state" in cond else len(cond["rgb"]) T = self.horizon_steps @@ -103,8 +107,8 @@ def forward( if get_logprob: log_prob = dist.log_prob(sampled_action) - # Right now we only apply squashing for SAC/RLPD, but not PPO - if apply_squashing: + # For SAC/RLPD, squash mean after sampling here instead of right after model output as in PPO + if self.tanh_output: sampled_action_squashed = torch.tanh(sampled_action) log_prob -= torch.log(1 - sampled_action_squashed.pow(2) + 1e-6) log_prob = log_prob.sum(1, keepdim=False) @@ -112,6 +116,6 @@ def forward( else: return sampled_action.view(B, T, -1), log_prob else: - if apply_squashing: + if self.tanh_output: sampled_action = torch.tanh(sampled_action) return sampled_action.view(B, T, -1) diff --git a/model/common/mlp.py b/model/common/mlp.py index 63d5e57..3322af9 100644 --- a/model/common/mlp.py +++ b/model/common/mlp.py @@ -7,7 +7,6 @@ import torch from torch import nn -from torch.nn.utils import spectral_norm from collections import OrderedDict import logging @@ -26,7 +25,6 @@ class MLP(nn.Module): - def __init__( self, dim_list, @@ -35,8 +33,9 @@ def __init__( activation_type="Tanh", out_activation_type="Identity", use_layernorm=False, - use_spectralnorm=False, use_layernorm_final=False, + dropout=0, + use_drop_final=False, verbose=False, ): super(MLP, self).__init__() @@ -51,39 +50,25 @@ def __init__( o_dim = dim_list[idx + 1] if append_dim > 0 and idx in append_layers: i_dim += append_dim - linear_layer = nn.Linear(i_dim, o_dim) - if use_spectralnorm: - linear_layer = spectral_norm(linear_layer) - if idx == num_layer - 1 and not use_layernorm_final: - module = nn.Sequential( - OrderedDict( - [ - ("linear_1", linear_layer), - ("act_1", activation_dict[out_activation_type]), - ] - ) - ) - else: - if use_layernorm: - module = nn.Sequential( - 
OrderedDict( - [ - ("linear_1", linear_layer), - ("norm_1", nn.LayerNorm(o_dim)), - ("act_1", activation_dict[activation_type]), - ] - ) - ) - else: - module = nn.Sequential( - OrderedDict( - [ - ("linear_1", linear_layer), - ("act_1", activation_dict[activation_type]), - ] - ) - ) + + # Add module components + layers = [("linear_1", linear_layer)] + if use_layernorm and (idx < num_layer - 1 or use_layernorm_final): + layers.append(("norm_1", nn.LayerNorm(o_dim))) + if dropout > 0 and (idx < num_layer - 1 or use_drop_final): + layers.append(("dropout_1", nn.Dropout(dropout))) + + # add activation function + act = ( + activation_dict[activation_type] + if idx != num_layer - 1 + else activation_dict[out_activation_type] + ) + layers.append(("act_1", act)) + + # re-construct module + module = nn.Sequential(OrderedDict(layers)) self.moduleList.append(module) if verbose: logging.info(self.moduleList) @@ -110,6 +95,7 @@ def __init__( activation_type="Mish", out_activation_type="Identity", use_layernorm=False, + use_layernorm_final=False, ): super(ResidualMLP, self).__init__() hidden_dim = dim_list[1] @@ -127,6 +113,8 @@ def __init__( ] ) self.layers.append(nn.Linear(hidden_dim, dim_list[-1])) + if use_layernorm_final: + self.layers.append(nn.LayerNorm(dim_list[-1])) self.layers.append(activation_dict[out_activation_type]) def forward(self, x): diff --git a/model/common/mlp_gaussian.py b/model/common/mlp_gaussian.py index a83a735..26d317f 100644 --- a/model/common/mlp_gaussian.py +++ b/model/common/mlp_gaussian.py @@ -186,8 +186,10 @@ def __init__( cond_dim, mlp_dims=[256, 256, 256], activation_type="Mish", + tanh_output=True, # sometimes we want to apply tanh after sampling instead of here, e.g., in SAC residual_style=False, use_layernorm=False, + dropout=0.0, fixed_std=None, learn_fixed_std=False, std_min=0.01, @@ -226,6 +228,7 @@ def __init__( activation_type=activation_type, out_activation_type="Identity", use_layernorm=use_layernorm, + dropout=dropout, ) if learn_fixed_std: # initialize to fixed_std @@ -244,6 +247,7 @@ def __init__( self.use_fixed_std = fixed_std is not None self.fixed_std = fixed_std self.learn_fixed_std = learn_fixed_std + self.tanh_output = tanh_output def forward(self, cond): B = len(cond["state"]) @@ -256,9 +260,9 @@ def forward(self, cond): if hasattr(self, "mlp_base"): state = self.mlp_base(state) out_mean = self.mlp_mean(state) - out_mean = torch.tanh(out_mean).view( - B, self.horizon_steps * self.transition_dim - ) # [-1, 1] + if self.tanh_output: + out_mean = torch.tanh(out_mean) + out_mean = out_mean.view(B, self.horizon_steps * self.transition_dim) if self.learn_fixed_std: out_logvar = torch.clamp(self.logvar, self.logvar_min, self.logvar_max) diff --git a/model/rl/gaussian_ibrl.py b/model/rl/gaussian_ibrl.py new file mode 100644 index 0000000..4552111 --- /dev/null +++ b/model/rl/gaussian_ibrl.py @@ -0,0 +1,193 @@ +""" +Imitation Bootstrapped Reinforcement Learning (IBRL) for Gaussian policy. 
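+
+Keeps a frozen copy of the pre-trained (imitation) policy alongside the online RL actor and an
+ensemble of critics. Greedy action selection, with two critic indices i, j drawn at random per call:
+
+    a_il ~ pi_imitation(s),  a_rl ~ pi_rl(s)
+    a = a_il  if min(Q_i(s, a_il), Q_j(s, a_il)) > min(Q_i(s, a_rl), Q_j(s, a_rl))  else a_rl
+
+With soft_action_sample=True, the choice between the two proposals is instead sampled
+stochastically, weighted by their Q-values through soft_action_sample_beta.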
+ +""" + +import torch +import torch.nn as nn +import logging +from copy import deepcopy + +from model.common.gaussian import GaussianModel + +log = logging.getLogger(__name__) + + +class IBRL_Gaussian(GaussianModel): + def __init__( + self, + actor, + critic, + n_critics, + soft_action_sample=False, + soft_action_sample_beta=0.1, + **kwargs, + ): + super().__init__(network=actor, **kwargs) + self.soft_action_sample = soft_action_sample + self.soft_action_sample_beta = soft_action_sample_beta + + # Set up target actor + self.target_actor = deepcopy(actor) + + # Frozen pre-trained policy + self.imitation_policy = deepcopy(actor) + for param in self.imitation_policy.parameters(): + param.requires_grad = False + + # initialize critic networks + self.critic_networks = [ + deepcopy(critic).to(self.device) for _ in range(n_critics) + ] + self.critic_networks = nn.ModuleList(self.critic_networks) + + # initialize target networks + self.target_networks = [ + deepcopy(critic).to(self.device) for _ in range(n_critics) + ] + self.target_networks = nn.ModuleList(self.target_networks) + + # Construct a "stateless" version of one of the models. It is "stateless" in the sense that the parameters are meta Tensors and do not have storage. + base_model = deepcopy(self.critic_networks[0]) + self.base_model = base_model.to("meta") + self.ensemble_params, self.ensemble_buffers = torch.func.stack_module_state( + self.critic_networks + ) + + def critic_wrapper(self, params, buffers, data): + """for vmap""" + return torch.func.functional_call(self.base_model, (params, buffers), data) + + def get_random_indices(self, sz=None, num_ind=2): + """get num_ind random indices from a set of size sz (used for getting critic targets)""" + if sz is None: + sz = len(self.critic_networks) + perm = torch.randperm(sz) + ind = perm[:num_ind].to(self.device) + return ind + + def loss_critic(self, obs, next_obs, actions, rewards, dones, gamma): + # get random critic index + q1_ind, q2_ind = self.get_random_indices() + with torch.no_grad(): + next_actions_il = super().forward( + cond=next_obs, + deterministic=False, + network_override=self.imitation_policy, + ) + next_actions_rl = super().forward( + cond=next_obs, + deterministic=False, + network_override=self.target_actor, + ) + + # get the IL Q value + next_q1_il = self.target_networks[q1_ind](next_obs, next_actions_il) + next_q2_il = self.target_networks[q2_ind](next_obs, next_actions_il) + next_q_il = torch.min(next_q1_il, next_q2_il) + + # get the RL Q value + next_q1_rl = self.target_networks[q1_ind](next_obs, next_actions_rl) + next_q2_rl = self.target_networks[q2_ind](next_obs, next_actions_rl) + next_q_rl = torch.min(next_q1_rl, next_q2_rl) + + # take the max Q value + next_q = torch.where(next_q_il > next_q_rl, next_q_il, next_q_rl) + + # target value + target_q = rewards + gamma * (1 - dones) * next_q # (B,) + + # run all critics in batch + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, actions) + ) # (n_critics, B) + loss_critic = torch.mean((current_q - target_q[None]) ** 2) + return loss_critic + + def loss_actor(self, obs): + action = super().forward( + obs, + deterministic=False, + reparameterize=True, + ) # use online policy only, also IBRL does not use tanh squashing + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, action) + ) # (n_critics, B) + current_q = current_q.min( + dim=0 + ).values # unlike RLPD, IBRL uses the 
min Q value for actor update + loss_actor = -torch.mean(current_q) + return loss_actor + + def update_target_critic(self, tau): + """need to use ensemble_params instead of critic_networks""" + for target_ind, target_critic in enumerate(self.target_networks): + for target_param_name, target_param in target_critic.named_parameters(): + source_param = self.ensemble_params[target_param_name][target_ind] + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + def update_target_actor(self, tau): + for target_param, source_param in zip( + self.target_actor.parameters(), self.network.parameters() + ): + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + # ---------- Sampling ----------# + + def forward( + self, + cond, + deterministic=False, + reparameterize=False, + ): + """use both pre-trained and online policies""" + q1_ind, q2_ind = self.get_random_indices() + + # sample an action from the imitation policy + imitation_action = super().forward( + cond=cond, + deterministic=deterministic, + network_override=self.imitation_policy, + ) + + # sample an action from the RL policy + rl_action = super().forward( + cond=cond, + deterministic=deterministic, + reparameterize=reparameterize, + ) + + # compute Q value of imitation policy + q_imitation_1 = self.critic_networks[q1_ind](cond, imitation_action) # (B,) + q_imitation_2 = self.critic_networks[q2_ind](cond, imitation_action) + q_imitation = torch.min(q_imitation_1, q_imitation_2) + + # compute Q value of RL policy + q_rl_1 = self.critic_networks[q1_ind](cond, rl_action) + q_rl_2 = self.critic_networks[q2_ind](cond, rl_action) + q_rl = torch.min(q_rl_1, q_rl_2) + + # soft sample or greedy + if self.soft_action_sample: + # compute the Q weights with probability proportional to exp(\beta * Q(a)) + qw_il = torch.exp(q_imitation * self.soft_action_sample_beta) + qw_rl = torch.exp(q_rl * self.soft_action_sample_beta) + q_weights = torch.softmax( + torch.stack([qw_il, qw_rl], dim=-1), + dim=-1, + ) + + # sample according to the weights + q_indices = torch.multinomial(q_weights, 1) + action = torch.where((q_indices == 0)[:, None], imitation_action, rl_action) + else: + action = torch.where( + q_imitation > q_rl[:, None, None], + imitation_action, + rl_action, + ) + return action diff --git a/model/rl/gaussian_rlpd.py b/model/rl/gaussian_rlpd.py index 1745781..6623253 100644 --- a/model/rl/gaussian_rlpd.py +++ b/model/rl/gaussian_rlpd.py @@ -40,8 +40,7 @@ def __init__( ] self.target_networks = nn.ModuleList(self.target_networks) - # Construct a "stateless" version of one of the models. It is "stateless" in - # the sense that the parameters are meta Tensors and do not have storage. + # Construct a "stateless" version of one of the models. It is "stateless" in the sense that the parameters are meta Tensors and do not have storage. 
base_model = deepcopy(self.critic_networks[0]) self.base_model = base_model.to("meta") self.ensemble_params, self.ensemble_buffers = torch.func.stack_module_state( @@ -85,10 +84,6 @@ def loss_critic(self, obs, next_obs, actions, rewards, dones, gamma, alpha): self.ensemble_params, self.ensemble_buffers, (obs, actions) ) # (n_critics, B) loss_critic = torch.mean((current_q - target_q[None]) ** 2) - # current_q = torch.stack( - # [critic(obs, actions) for critic in self.critic_networks], dim=-1 - # ) # (B, n_critics) - # loss_critic = torch.mean((current_q - target_q.unsqueeze(-1)) ** 2) return loss_critic def loss_actor(self, obs, alpha): @@ -102,10 +97,6 @@ def loss_actor(self, obs, alpha): self.ensemble_params, self.ensemble_buffers, (obs, action) ) # (n_critics, B) current_q = current_q.mean(dim=0) + alpha * (-logprob) - # current_q = torch.stack( - # [critic(obs, action) for critic in self.critic_networks], dim=-1 - # ) # (B, n_critics) - # current_q = current_q.mean(dim=-1) + alpha * (-logprob) loss_actor = -torch.mean(current_q) return loss_actor @@ -121,15 +112,6 @@ def loss_temperature(self, obs, alpha, target_entropy): def update_target_critic(self, tau): """need to use ensemble_params instead of critic_networks""" - # for target_critic, source_critic in zip( - # self.target_networks, self.critic_networks - # ): - # for target_param, source_param in zip( - # target_critic.parameters(), source_critic.parameters() - # ): - # target_param.data.copy_( - # target_param.data * (1.0 - tau) + source_param.data * tau - # ) for target_ind, target_critic in enumerate(self.target_networks): for target_param_name, target_param in target_critic.named_parameters(): source_param = self.ensemble_params[target_param_name][target_ind] diff --git a/model/rl/gaussian_sac.py b/model/rl/gaussian_sac.py index dbebaa0..c8ad1c4 100644 --- a/model/rl/gaussian_sac.py +++ b/model/rl/gaussian_sac.py @@ -76,20 +76,3 @@ def update_target_critic(self, tau): target_param.data.copy_( target_param.data * (1.0 - tau) + source_param.data * tau ) - - # ---------- Sampling ----------# - - def forward( - self, - cond, - deterministic=False, - reparameterize=False, # allow gradient - get_logprob=False, - ): - return super().forward( - cond=cond, - deterministic=deterministic, - reparameterize=reparameterize, - get_logprob=get_logprob, - apply_squashing=True, - ) diff --git a/script/dataset/README.md b/script/dataset/README.md index 8d71e7b..8591434 100644 --- a/script/dataset/README.md +++ b/script/dataset/README.md @@ -1,3 +1,17 @@ ## Data processing scripts -These are some scripts used for processing the raw datasets from the benchmarks. We already pre-processed them and provide the final datasets. These scripts are for information only. \ No newline at end of file +These are some scripts used for processing the raw datasets from the benchmarks. We already pre-processed them and provide the final datasets. These scripts are for information only. 
+
+```console
+python script/dataset/get_d4rl_dataset.py --env_name=hopper-medium-v2 --save_dir=data/gym/hopper-medium-v2
+python script/dataset/process_robomimic_dataset.py --load_path=../robomimic_raw_data/lift_low_dim_v141.hdf5 --save_dir=data/robomimic/lift --normalize
+```
+
+The raw robomimic data can be downloaded by cloning the robomimic repository and running:
+```console
+cd ~/robomimic/robomimic/scripts
+python download_datasets.py --tasks all --dataset_types mh --hdf5_types low_dim # state-only policy
+python download_datasets.py --tasks all --dataset_types mh --hdf5_types raw # pixel-based policy
+# for pixel, replay the trajectories to extract image observations
+python robomimic/scripts/dataset_states_to_obs.py --done_mode 2 --dataset datasets/can/mh/demo_v141.hdf5 --output_name image_v141.hdf5 --camera_names robot0_eye_in_hand --camera_height 96 --camera_width 96 --exclude-next-obs --n 100
+```
\ No newline at end of file
diff --git a/script/dataset/get_d4rl_dataset.py b/script/dataset/get_d4rl_dataset.py
index f52bf43..76ddb9f 100644
--- a/script/dataset/get_d4rl_dataset.py
+++ b/script/dataset/get_d4rl_dataset.py
@@ -77,14 +77,16 @@ def make_dataset(env_name, save_dir, save_name_prefix, val_split, logger):
         # Get the trajectory length and slice
         traj_length = cur_index - prev_index + 1
-        trajectory = {key: dataset[key][prev_index : cur_index + 1] for key in ["states", "actions", "rewards"]}
+        trajectory = {
+            key: dataset[key][prev_index : cur_index + 1]
+            for key in ["states", "actions", "rewards"]
+        }
 
         # Skip if there is no reward in the episode
         if np.sum(trajectory["rewards"]) > 0:
             # Scale observations and actions
             trajectory["states"] = (
-                2 * (trajectory["states"] - obs_min) / (obs_max - obs_min + 1e-6)
-                - 1
+                2 * (trajectory["states"] - obs_min) / (obs_max - obs_min + 1e-6) - 1
             )
             trajectory["actions"] = (
                 2