From b6769612dcbf90527aba144a933a4871ad99c9ec Mon Sep 17 00:00:00 2001 From: Justin Lidard <60638575+jlidard@users.noreply.github.com> Date: Fri, 27 Sep 2024 12:39:32 -0400 Subject: [PATCH] Ibrl (#2) * remove dataset consistency check * add pretrain configs * rename * transport pretrain cfg * add ibrl * fix base policy * set `deterministic=True` when sampling in diffusion evaluation * minors * Revert "add rlpd framework" * Revert "Revert "add rlpd framework"" (#4) * match rlpd param names * rename to `StitchedSequenceQLearningDataset` * add configs * add `tanh_output` and dropout to gaussians * fix ibrl * minors --------- Co-authored-by: Justin M. Lidard Co-authored-by: allenzren --- agent/finetune/train_ibrl_agent.py | 334 ++++++++++++++++++ agent/finetune/train_sac_agent.py | 18 +- cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml | 54 +++ cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml | 107 ++++++ cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml | 98 +++++ cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml | 108 ++++++ cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml | 2 +- cfg/gym/finetune/hopper-v2/sac_mlp.yaml | 3 +- .../pre_gaussian_mlp.yaml | 61 ++++ .../hopper-medium-v2/pre_gaussian_mlp.yaml | 60 ++++ .../walker2d-medium-v2/pre_gaussian_mlp.yaml | 60 ++++ cfg/robomimic/finetune/square/rlpd_mlp.yaml | 114 ++++++ .../pretrain/can/pre_gaussian_mlp_ibrl.yaml | 59 ++++ .../pretrain/lift/pre_gaussian_mlp_ibrl.yaml | 60 ++++ .../square/pre_gaussian_mlp_ibrl.yaml | 60 ++++ .../transport/pre_gaussian_mlp_ibrl.yaml | 60 ++++ model/common/gaussian.py | 14 +- model/common/mlp.py | 58 ++- model/common/mlp_gaussian.py | 10 +- model/rl/gaussian_ibrl.py | 193 ++++++++++ model/rl/gaussian_rlpd.py | 20 +- model/rl/gaussian_sac.py | 17 - script/dataset/README.md | 16 +- script/dataset/get_d4rl_dataset.py | 8 +- 24 files changed, 1502 insertions(+), 92 deletions(-) create mode 100644 agent/finetune/train_ibrl_agent.py create mode 100644 cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml create mode 100644 cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml create mode 100644 cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml create mode 100644 cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml create mode 100644 cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml create mode 100644 cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml create mode 100644 cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml create mode 100644 cfg/robomimic/finetune/square/rlpd_mlp.yaml create mode 100644 cfg/robomimic/pretrain/can/pre_gaussian_mlp_ibrl.yaml create mode 100644 cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml create mode 100644 cfg/robomimic/pretrain/square/pre_gaussian_mlp_ibrl.yaml create mode 100644 cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml create mode 100644 model/rl/gaussian_ibrl.py diff --git a/agent/finetune/train_ibrl_agent.py b/agent/finetune/train_ibrl_agent.py new file mode 100644 index 0000000..332db5c --- /dev/null +++ b/agent/finetune/train_ibrl_agent.py @@ -0,0 +1,334 @@ +""" +Imitation Bootstrapped Reinforcement Learning (IBRL) agent training script. + +Does not support image observations right now. 
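+
+Pipeline (as implemented below): seed a FIFO replay buffer with the offline demonstration
+dataset, roll out the current IBRL policy in the (vectorized) environment, then on each update
+step run `critic_num_update` critic updates (each followed by an EMA target-critic update) and a
+single actor update with an EMA target-actor update.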
+""" + +import os +import pickle +import numpy as np +import torch +import logging +import wandb +import hydra +from collections import deque + +log = logging.getLogger(__name__) +from util.timer import Timer +from agent.finetune.train_agent import TrainAgent +from util.scheduler import CosineAnnealingWarmupRestarts + + +class TrainIBRLAgent(TrainAgent): + def __init__(self, cfg): + super().__init__(cfg) + + # Build dataset + self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset) + + # note the discount factor gamma here is applied to reward every act_steps, instead of every env step + self.gamma = cfg.train.gamma + + # Optimizer + self.actor_optimizer = torch.optim.AdamW( + self.model.network.parameters(), + lr=cfg.train.actor_lr, + weight_decay=cfg.train.actor_weight_decay, + ) + self.actor_lr_scheduler = CosineAnnealingWarmupRestarts( + self.actor_optimizer, + first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.actor_lr, + min_lr=cfg.train.actor_lr_scheduler.min_lr, + warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps, + gamma=1.0, + ) + self.critic_optimizer = torch.optim.AdamW( + self.model.critic_networks.parameters(), + lr=cfg.train.critic_lr, + weight_decay=cfg.train.critic_weight_decay, + ) + self.critic_lr_scheduler = CosineAnnealingWarmupRestarts( + self.critic_optimizer, + first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.critic_lr, + min_lr=cfg.train.critic_lr_scheduler.min_lr, + warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps, + gamma=1.0, + ) + + # Perturbation scale + self.target_ema_rate = cfg.train.target_ema_rate + + # Reward scale + self.scale_reward_factor = cfg.train.scale_reward_factor + + # Number of critic updates + self.critic_num_update = cfg.train.critic_num_update + + # Update frequency + self.update_freq = cfg.train.update_freq + + # Buffer size + self.buffer_size = cfg.train.buffer_size + + # Eval episodes + self.n_eval_episode = cfg.train.n_eval_episode + + # Exploration steps at the beginning - using randomly sampled action + self.n_explore_steps = cfg.train.n_explore_steps + + def run(self): + # make a FIFO replay buffer for obs, action, and reward + obs_buffer = deque(maxlen=self.buffer_size) + next_obs_buffer = deque(maxlen=self.buffer_size) + action_buffer = deque(maxlen=self.buffer_size) + reward_buffer = deque(maxlen=self.buffer_size) + done_buffer = deque(maxlen=self.buffer_size) + + # collect the offline dataset + states = self.dataset_offline.states + next_states = torch.roll(states, shifts=1, dims=0) + next_states[0] = 0 + actions = self.dataset_offline.actions + rewards = self.dataset_offline.rewards + dones = self.dataset_offline.dones + + # initailize the replay buffer with offline data + obs_buffer.extend(states[: self.buffer_size, None].cpu().numpy()) + next_obs_buffer.extend(next_states[: self.buffer_size, None].cpu().numpy()) + action_buffer.extend(actions[: self.buffer_size, None].cpu().numpy()) + reward_buffer.extend(rewards[: self.buffer_size].cpu().numpy()) + done_buffer.extend(dones[: self.buffer_size].cpu().numpy()) + + # Start training loop + timer = Timer() + run_results = [] + done_venv = np.zeros((1, self.n_envs)) + while self.itr < self.n_train_itr: + if self.itr % 1000 == 0: + print(f"Finished training iteration {self.itr} of {self.n_train_itr}") + + # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple 
episodes from one env + options_venv = [{} for _ in range(self.n_envs)] + if self.itr % self.render_freq == 0 and self.render_video: + for env_ind in range(self.n_render): + options_venv[env_ind]["video_path"] = os.path.join( + self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4" + ) + + # Define train or eval - all envs restart + eval_mode = ( + self.itr % self.val_freq == 0 + and self.itr > self.n_explore_steps + and not self.force_train + ) + n_steps = ( + self.n_steps if not eval_mode else int(1e5) + ) # large number for eval mode + self.model.eval() if eval_mode else self.model.train() + + # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning + firsts_trajs = np.empty((0, self.n_envs)) + if self.reset_at_iteration or eval_mode or self.itr == 0: + prev_obs_venv = self.reset_env_all(options_venv=options_venv) + firsts_trajs = np.vstack((firsts_trajs, np.ones((1, self.n_envs)))) + else: + # if done at the end of last iteration, then the envs are just reset + firsts_trajs = np.vstack((firsts_trajs, done_venv)) + reward_trajs = np.empty((0, self.n_envs)) + + # Collect a set of trajectories from env + cnt_episode = 0 + for _ in range(n_steps): + + # Select action + if self.itr < self.n_explore_steps: + action_venv = self.venv.action_space.sample() + else: + with torch.no_grad(): + cond = { + "state": torch.from_numpy(prev_obs_venv["state"]) + .float() + .to(self.device) + } + samples = ( + self.model( + cond=cond, + deterministic=eval_mode, + ) + .cpu() + .numpy() + ) # n_env x horizon x act + action_venv = samples[:, : self.act_steps] + + # Apply multi-step action + obs_venv, reward_venv, done_venv, info_venv = self.venv.step( + action_venv + ) + reward_trajs = np.vstack((reward_trajs, reward_venv[None])) + + # add to buffer in train mode + if not eval_mode: + for i in range(self.n_envs): + obs_buffer.append(prev_obs_venv["state"][i]) + next_obs_buffer.append(obs_venv["state"][i]) + action_buffer.append(action_venv[i]) + reward_buffer.append(reward_venv[i] * self.scale_reward_factor) + done_buffer.append(done_venv[i]) + firsts_trajs = np.vstack( + (firsts_trajs, done_venv) + ) # offset by one step + prev_obs_venv = obs_venv + + # check if enough eval episodes are done + cnt_episode += np.sum(done_venv) + if eval_mode and cnt_episode >= self.n_eval_episode: + break + + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
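+            # An episode is delimited by two consecutive reset markers (1s) in firsts_trajs;
+            # a trailing segment without a closing marker is treated as unfinished and skipped.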
+ episodes_start_end = [] + for env_ind in range(self.n_envs): + env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0] + for i in range(len(env_steps) - 1): + start = env_steps[i] + end = env_steps[i + 1] + if end - start > 1: + episodes_start_end.append((env_ind, start, end - 1)) + if len(episodes_start_end) > 0: + reward_trajs_split = [ + reward_trajs[start : end + 1, env_ind] + for env_ind, start, end in episodes_start_end + ] + num_episode_finished = len(reward_trajs_split) + episode_reward = np.array( + [np.sum(reward_traj) for reward_traj in reward_trajs_split] + ) + episode_best_reward = np.array( + [ + np.max(reward_traj) / self.act_steps + for reward_traj in reward_trajs_split + ] + ) + avg_episode_reward = np.mean(episode_reward) + avg_best_reward = np.mean(episode_best_reward) + success_rate = np.mean( + episode_best_reward >= self.best_reward_threshold_for_success + ) + else: + episode_reward = np.array([]) + num_episode_finished = 0 + avg_episode_reward = 0 + avg_best_reward = 0 + success_rate = 0 + + # Update models + if ( + not eval_mode + and self.itr > self.n_explore_steps + and self.itr % self.update_freq == 0 + ): + obs_array = np.array(obs_buffer) + next_obs_array = np.array(next_obs_buffer) + actions_array = np.array(action_buffer) + rewards_array = np.array(reward_buffer) + dones_array = np.array(done_buffer) + + # Update critic more frequently + for _ in range(self.critic_num_update): + # Sample from online buffer + inds = np.random.choice(len(obs_buffer), self.batch_size) + obs_b = torch.from_numpy(obs_array[inds]).float().to(self.device) + next_obs_b = ( + torch.from_numpy(next_obs_array[inds]).float().to(self.device) + ) + actions_b = ( + torch.from_numpy(actions_array[inds]).float().to(self.device) + ) + rewards_b = ( + torch.from_numpy(rewards_array[inds]).float().to(self.device) + ) + dones_b = ( + torch.from_numpy(dones_array[inds]).float().to(self.device) + ) + # Update critic + loss_critic = self.model.loss_critic( + {"state": obs_b}, + {"state": next_obs_b}, + actions_b, + rewards_b, + dones_b, + self.gamma, + ) + self.critic_optimizer.zero_grad() + loss_critic.backward() + self.critic_optimizer.step() + + # Update target critic every critic update + self.model.update_target_critic(self.target_ema_rate) + + # Update actor once with the final batch + loss_actor = self.model.loss_actor( + {"state": obs_b}, + ) + self.actor_optimizer.zero_grad() + loss_actor.backward() + self.actor_optimizer.step() + + # Update target actor + self.model.update_target_actor(self.target_ema_rate) + + # Update lr + self.actor_lr_scheduler.step() + self.critic_lr_scheduler.step() + + # Save model + if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1: + self.save_model() + + # Log loss and save metrics + run_results.append({"itr": self.itr}) + if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps: + time = timer() + if eval_mode: + log.info( + f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "success rate - eval": success_rate, + "avg episode reward - eval": avg_episode_reward, + "avg best reward - eval": avg_best_reward, + "num episode - eval": num_episode_finished, + }, + step=self.itr, + commit=False, + ) + run_results[-1]["eval_success_rate"] = success_rate + run_results[-1]["eval_episode_reward"] = avg_episode_reward + run_results[-1]["eval_best_reward"] = avg_best_reward + else: + log.info( + f"{self.itr}: loss 
actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "loss - actor": loss_actor, + "loss - critic": loss_critic, + "avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + }, + step=self.itr, + commit=True, + ) + run_results[-1]["loss_actor"] = loss_actor + run_results[-1]["loss_critic"] = loss_critic + run_results[-1]["train_episode_reward"] = avg_episode_reward + run_results[-1]["time"] = time + with open(self.result_path, "wb") as f: + pickle.dump(run_results, f) + self.itr += 1 diff --git a/agent/finetune/train_sac_agent.py b/agent/finetune/train_sac_agent.py index 27dccc1..cc4798e 100644 --- a/agent/finetune/train_sac_agent.py +++ b/agent/finetune/train_sac_agent.py @@ -62,8 +62,12 @@ def __init__(self, cfg): self.scale_reward_factor = cfg.train.scale_reward_factor # Actor/critic update frequency - assume single env - self.critic_update_freq = int(cfg.train.batch_size / cfg.train.critic_replay_ratio) - self.actor_update_freq = int(cfg.train.batch_size / cfg.train.actor_replay_ratio) + self.critic_update_freq = int( + cfg.train.batch_size / cfg.train.critic_replay_ratio + ) + self.actor_update_freq = int( + cfg.train.batch_size / cfg.train.actor_replay_ratio + ) # Buffer size self.buffer_size = cfg.train.buffer_size @@ -215,10 +219,12 @@ def run(self): success_rate = 0 # Update models - if not eval_mode and self.itr > self.n_explore_steps and self.itr % self.critic_update_freq == 0: - inds = np.random.choice( - len(obs_buffer), self.batch_size, replace=False - ) + if ( + not eval_mode + and self.itr > self.n_explore_steps + and self.itr % self.critic_update_freq == 0 + ): + inds = np.random.choice(len(obs_buffer), self.batch_size, replace=False) obs_b = ( torch.from_numpy(np.array([obs_buffer[i] for i in inds])) .float() diff --git a/cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml b/cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml new file mode 100644 index 0000000..9759941 --- /dev/null +++ b/cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml @@ -0,0 +1,54 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.eval.eval_gaussian_agent.EvalGaussianAgent + +name: ${env_name}_eval_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +n_steps: 1000 # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation. 
+render_num: 0 + +env: + n_envs: 40 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +model: + _target_: model.common.gaussian.GaussianModel + # + network_path: ${base_policy_path} + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: Mish + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${action_dim} + horizon_steps: ${horizon_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml new file mode 100644 index 0000000..dcdef35 --- /dev/null +++ b/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml @@ -0,0 +1,107 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: rlpd-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 250000 + n_steps: 1 + gamma: 0.99 + actor_lr: 3e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 50000 + val_freq: 1000 + render: + freq: 1 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.005 + scale_reward_factor: 0.1 + critic_num_update: 2 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 2000 # used in RLPD locomotion + target_entropy: ${eval:'- ${action_dim} * ${act_steps} / 2'} + init_temperature: 1 + +model: + _target_: model.rl.gaussian_rlpd.RLPD_Gaussian + randn_clip_value: 10 + backup_entropy: True + n_critics: 5 # Ensemble size for critic models + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${action_dim} + std_max: 7.3891 + std_min: 2.061e-9 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: 
${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml new file mode 100644 index 0000000..622c5ba --- /dev/null +++ b/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml @@ -0,0 +1,98 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_sac_agent.TrainSACAgent + +name: ${env_name}_sac_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +device: cuda:0 +env_name: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: sac-gym-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 3e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100000 + val_freq: 10000 + render: + freq: 1 + num: 0 + log_freq: 200 + # SAC specific + batch_size: 128 + target_ema_rate: 0.005 + scale_reward_factor: 0.1 + critic_replay_ratio: 128 + actor_replay_ratio: 64 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 5000 + target_entropy: ${eval:'- ${action_dim} * ${act_steps} / 2'} + init_temperature: 0.2 + +model: + _target_: model.rl.gaussian_sac.SAC_Gaussian + randn_clip_value: 10 + tanh_output: True # squash after sampling + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${action_dim} + std_max: 7.3891 + std_min: 2.061e-9 + critic: # no layernorm + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256] + activation_type: ReLU + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} diff --git a/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml b/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml new file mode 100644 index 0000000..8c2e601 --- /dev/null +++ b/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml @@ -0,0 +1,108 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +base_policy_path: 
${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_gaussian_mlp_ta1/2024-09-25_14-57-07_42/checkpoint/state_1000.pt +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: ibrl-gym-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 250000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 5000 + render: + freq: 1 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 0.1 + critic_num_update: 5 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.rl.gaussian_ibrl.IBRL_Gaussian + network_path: ${base_policy_path} + randn_clip_value: 3 + n_critics: 5 # Ensemble size for critic models + soft_action_sample: True + soft_action_sample_beta: 0.1 + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${action_dim} + fixed_std: 0.1 + # dropout: 0.5 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml b/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml index 8b9ec1f..f40e884 100644 --- a/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml +++ b/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml @@ -55,7 +55,7 @@ train: first_cycle_steps: 1000 warmup_steps: 10 min_lr: 3e-4 - save_model_freq: 100000 + save_model_freq: 50000 val_freq: 5000 render: freq: 1 diff --git a/cfg/gym/finetune/hopper-v2/sac_mlp.yaml b/cfg/gym/finetune/hopper-v2/sac_mlp.yaml index 560f37f..803d334 100644 --- a/cfg/gym/finetune/hopper-v2/sac_mlp.yaml +++ b/cfg/gym/finetune/hopper-v2/sac_mlp.yaml @@ -36,7 +36,7 @@ env: wandb: entity: ${oc.env:DPPO_WANDB_ENTITY} - project: sac-gym-${env_name}-finetune + project: sac-gym-${env_name} run: ${now:%H-%M-%S}_${name} train: @@ -85,7 +85,6 @@ model: transition_dim: ${action_dim} std_max: 7.3891 std_min: 2.061e-9 - # fixed_std: 0.1 critic: # no layernorm _target_: model.common.critic.CriticObsAct mlp_dims: [256, 256] diff --git a/cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml new file 
mode 100644 index 0000000..5c4fb7f --- /dev/null +++ b/cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml @@ -0,0 +1,61 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:1 +env: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +transition_dim: ${action_dim} +horizon_steps: 4 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain-gaussian + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 3000 + batch_size: 128 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 3000 + warmup_steps: 1 + min_lr: 1e-4 + epoch_start_ema: 10 + update_ema_freq: 5 + save_model_freq: 100 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${transition_dim} + fixed_std: 0.1 + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml new file mode 100644 index 0000000..9a97086 --- /dev/null +++ b/cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml @@ -0,0 +1,60 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:0 +env: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +transition_dim: ${action_dim} +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 500 + batch_size: 128 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 1 + min_lr: 1e-4 + epoch_start_ema: 10 + update_ema_freq: 5 + save_model_freq: 100 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: Mish + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${transition_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml new file mode 100644 index 0000000..3917aff --- /dev/null +++ b/cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml @@ -0,0 +1,60 @@ 
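+# Behavior-cloning pretraining config for a Gaussian MLP policy on walker2d-medium-v2
+# (the resulting checkpoint can serve as the base policy for IBRL/RLPD fine-tuning).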
+defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:1 +env: walker2d-medium-v2 +obs_dim: 17 +action_dim: 6 +transition_dim: ${action_dim} +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain-gaussian + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 3000 + batch_size: 128 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 3000 + warmup_steps: 1 + min_lr: 1e-4 + epoch_start_ema: 10 + update_ema_freq: 5 + save_model_freq: 100 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${transition_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/rlpd_mlp.yaml b/cfg/robomimic/finetune/square/rlpd_mlp.yaml new file mode 100644 index 0000000..a34d26a --- /dev/null +++ b/cfg/robomimic/finetune/square/rlpd_mlp.yaml @@ -0,0 +1,114 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 400 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: rlpd-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 250000 + n_steps: 1 + gamma: 0.99 + actor_lr: 3e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 50000 + val_freq: 5000 + render: + freq: 1 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.005 + scale_reward_factor: 0.1 + critic_num_update: 20 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 5000 + target_entropy: ${eval:'- ${action_dim} * ${act_steps} / 2'} + 
init_temperature: 1 + +model: + _target_: model.rl.gaussian_rlpd.RLPD_Gaussian + randn_clip_value: 10 + backup_entropy: True + n_critics: 10 # Ensemble size for critic models + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${action_dim} + std_max: 7.3891 + std_min: 2.061e-9 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + residual_style: False + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ibrl.yaml new file mode 100644 index 0000000..9bd074d --- /dev/null +++ b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ibrl.yaml @@ -0,0 +1,59 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz + +seed: 42 +device: cuda:0 +env: can +obs_dim: 23 +action_dim: 7 +transition_dim: ${action_dim} +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain-gaussian + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 5000 + warmup_steps: 100 + min_lr: 1e-5 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${transition_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml new file mode 100644 index 0000000..69a49ae --- /dev/null +++ b/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml @@ -0,0 +1,60 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz + +seed: 42 +device: cuda:0 +env: lift +obs_dim: 19 +action_dim: 7 +transition_dim: ${action_dim} +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain + 
run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 5000 + warmup_steps: 100 + min_lr: 1e-5 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${transition_dim} + horizon_steps: ${horizon_steps} + device: ${device} + + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ibrl.yaml new file mode 100644 index 0000000..c78661f --- /dev/null +++ b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ibrl.yaml @@ -0,0 +1,60 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz + +seed: 42 +device: cuda:0 +env: square +obs_dim: 23 +action_dim: 7 +transition_dim: ${action_dim} +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 5000 + warmup_steps: 100 + min_lr: 1e-5 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${transition_dim} + horizon_steps: ${horizon_steps} + device: ${device} + + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml new file mode 100644 index 0000000..e9ca5f7 --- /dev/null +++ b/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml @@ -0,0 +1,60 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz + +seed: 42 +device: cuda:0 +env: transport +obs_dim: 59 +action_dim: 14 +transition_dim: ${action_dim} +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 5000 + 
warmup_steps: 100 + min_lr: 1e-5 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + transition_dim: ${transition_dim} + horizon_steps: ${horizon_steps} + device: ${device} + + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/model/common/gaussian.py b/model/common/gaussian.py index d7725fd..5246f38 100644 --- a/model/common/gaussian.py +++ b/model/common/gaussian.py @@ -19,6 +19,7 @@ def __init__( network_path=None, device="cuda:0", randn_clip_value=10, + tanh_output=False, ): super().__init__() self.device = device @@ -29,7 +30,7 @@ def __init__( ) self.load_state_dict( checkpoint["model"], - strict=False, + strict=True, ) log.info("Loaded actor from %s", network_path) log.info( @@ -40,12 +41,16 @@ def __init__( # Clip sampled randn (from standard deviation) such that the sampled action is not too far away from mean self.randn_clip_value = randn_clip_value + # Whether to apply tanh to the **sampled** action --- used in SAC + self.tanh_output = tanh_output + def loss( self, true_action, cond, ent_coef, ): + """no squashing""" B = len(true_action) dist = self.forward_train( cond, @@ -82,7 +87,6 @@ def forward( network_override=None, reparameterize=False, get_logprob=False, - apply_squashing=False, ): B = len(cond["state"]) if "state" in cond else len(cond["rgb"]) T = self.horizon_steps @@ -103,8 +107,8 @@ def forward( if get_logprob: log_prob = dist.log_prob(sampled_action) - # Right now we only apply squashing for SAC/RLPD, but not PPO - if apply_squashing: + # For SAC/RLPD, squash mean after sampling here instead of right after model output as in PPO + if self.tanh_output: sampled_action_squashed = torch.tanh(sampled_action) log_prob -= torch.log(1 - sampled_action_squashed.pow(2) + 1e-6) log_prob = log_prob.sum(1, keepdim=False) @@ -112,6 +116,6 @@ def forward( else: return sampled_action.view(B, T, -1), log_prob else: - if apply_squashing: + if self.tanh_output: sampled_action = torch.tanh(sampled_action) return sampled_action.view(B, T, -1) diff --git a/model/common/mlp.py b/model/common/mlp.py index 63d5e57..3322af9 100644 --- a/model/common/mlp.py +++ b/model/common/mlp.py @@ -7,7 +7,6 @@ import torch from torch import nn -from torch.nn.utils import spectral_norm from collections import OrderedDict import logging @@ -26,7 +25,6 @@ class MLP(nn.Module): - def __init__( self, dim_list, @@ -35,8 +33,9 @@ def __init__( activation_type="Tanh", out_activation_type="Identity", use_layernorm=False, - use_spectralnorm=False, use_layernorm_final=False, + dropout=0, + use_drop_final=False, verbose=False, ): super(MLP, self).__init__() @@ -51,39 +50,25 @@ def __init__( o_dim = dim_list[idx + 1] if append_dim > 0 and idx in append_layers: i_dim += append_dim - linear_layer = nn.Linear(i_dim, o_dim) - if use_spectralnorm: - linear_layer = spectral_norm(linear_layer) - if idx == num_layer - 1 and not use_layernorm_final: - module = nn.Sequential( - OrderedDict( - [ - ("linear_1", linear_layer), - ("act_1", activation_dict[out_activation_type]), - ] - ) - ) - else: - if use_layernorm: - module = nn.Sequential( - 
OrderedDict( - [ - ("linear_1", linear_layer), - ("norm_1", nn.LayerNorm(o_dim)), - ("act_1", activation_dict[activation_type]), - ] - ) - ) - else: - module = nn.Sequential( - OrderedDict( - [ - ("linear_1", linear_layer), - ("act_1", activation_dict[activation_type]), - ] - ) - ) + + # Add module components + layers = [("linear_1", linear_layer)] + if use_layernorm and (idx < num_layer - 1 or use_layernorm_final): + layers.append(("norm_1", nn.LayerNorm(o_dim))) + if dropout > 0 and (idx < num_layer - 1 or use_drop_final): + layers.append(("dropout_1", nn.Dropout(dropout))) + + # add activation function + act = ( + activation_dict[activation_type] + if idx != num_layer - 1 + else activation_dict[out_activation_type] + ) + layers.append(("act_1", act)) + + # re-construct module + module = nn.Sequential(OrderedDict(layers)) self.moduleList.append(module) if verbose: logging.info(self.moduleList) @@ -110,6 +95,7 @@ def __init__( activation_type="Mish", out_activation_type="Identity", use_layernorm=False, + use_layernorm_final=False, ): super(ResidualMLP, self).__init__() hidden_dim = dim_list[1] @@ -127,6 +113,8 @@ def __init__( ] ) self.layers.append(nn.Linear(hidden_dim, dim_list[-1])) + if use_layernorm_final: + self.layers.append(nn.LayerNorm(dim_list[-1])) self.layers.append(activation_dict[out_activation_type]) def forward(self, x): diff --git a/model/common/mlp_gaussian.py b/model/common/mlp_gaussian.py index a83a735..26d317f 100644 --- a/model/common/mlp_gaussian.py +++ b/model/common/mlp_gaussian.py @@ -186,8 +186,10 @@ def __init__( cond_dim, mlp_dims=[256, 256, 256], activation_type="Mish", + tanh_output=True, # sometimes we want to apply tanh after sampling instead of here, e.g., in SAC residual_style=False, use_layernorm=False, + dropout=0.0, fixed_std=None, learn_fixed_std=False, std_min=0.01, @@ -226,6 +228,7 @@ def __init__( activation_type=activation_type, out_activation_type="Identity", use_layernorm=use_layernorm, + dropout=dropout, ) if learn_fixed_std: # initialize to fixed_std @@ -244,6 +247,7 @@ def __init__( self.use_fixed_std = fixed_std is not None self.fixed_std = fixed_std self.learn_fixed_std = learn_fixed_std + self.tanh_output = tanh_output def forward(self, cond): B = len(cond["state"]) @@ -256,9 +260,9 @@ def forward(self, cond): if hasattr(self, "mlp_base"): state = self.mlp_base(state) out_mean = self.mlp_mean(state) - out_mean = torch.tanh(out_mean).view( - B, self.horizon_steps * self.transition_dim - ) # [-1, 1] + if self.tanh_output: + out_mean = torch.tanh(out_mean) + out_mean = out_mean.view(B, self.horizon_steps * self.transition_dim) if self.learn_fixed_std: out_logvar = torch.clamp(self.logvar, self.logvar_min, self.logvar_max) diff --git a/model/rl/gaussian_ibrl.py b/model/rl/gaussian_ibrl.py new file mode 100644 index 0000000..4552111 --- /dev/null +++ b/model/rl/gaussian_ibrl.py @@ -0,0 +1,193 @@ +""" +Imitation Bootstrapped Reinforcement Learning (IBRL) for Gaussian policy. 
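+
+Keeps a frozen copy of the pre-trained (imitation) policy alongside the online RL actor and an
+ensemble of critics. Greedy action selection, with two critic indices i, j drawn at random per call:
+
+    a_il ~ pi_imitation(s),  a_rl ~ pi_rl(s)
+    a = a_il  if min(Q_i(s, a_il), Q_j(s, a_il)) > min(Q_i(s, a_rl), Q_j(s, a_rl))  else a_rl
+
+With soft_action_sample=True, the choice between the two proposals is instead sampled
+stochastically, weighted by their Q-values through soft_action_sample_beta.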
+ +""" + +import torch +import torch.nn as nn +import logging +from copy import deepcopy + +from model.common.gaussian import GaussianModel + +log = logging.getLogger(__name__) + + +class IBRL_Gaussian(GaussianModel): + def __init__( + self, + actor, + critic, + n_critics, + soft_action_sample=False, + soft_action_sample_beta=0.1, + **kwargs, + ): + super().__init__(network=actor, **kwargs) + self.soft_action_sample = soft_action_sample + self.soft_action_sample_beta = soft_action_sample_beta + + # Set up target actor + self.target_actor = deepcopy(actor) + + # Frozen pre-trained policy + self.imitation_policy = deepcopy(actor) + for param in self.imitation_policy.parameters(): + param.requires_grad = False + + # initialize critic networks + self.critic_networks = [ + deepcopy(critic).to(self.device) for _ in range(n_critics) + ] + self.critic_networks = nn.ModuleList(self.critic_networks) + + # initialize target networks + self.target_networks = [ + deepcopy(critic).to(self.device) for _ in range(n_critics) + ] + self.target_networks = nn.ModuleList(self.target_networks) + + # Construct a "stateless" version of one of the models. It is "stateless" in the sense that the parameters are meta Tensors and do not have storage. + base_model = deepcopy(self.critic_networks[0]) + self.base_model = base_model.to("meta") + self.ensemble_params, self.ensemble_buffers = torch.func.stack_module_state( + self.critic_networks + ) + + def critic_wrapper(self, params, buffers, data): + """for vmap""" + return torch.func.functional_call(self.base_model, (params, buffers), data) + + def get_random_indices(self, sz=None, num_ind=2): + """get num_ind random indices from a set of size sz (used for getting critic targets)""" + if sz is None: + sz = len(self.critic_networks) + perm = torch.randperm(sz) + ind = perm[:num_ind].to(self.device) + return ind + + def loss_critic(self, obs, next_obs, actions, rewards, dones, gamma): + # get random critic index + q1_ind, q2_ind = self.get_random_indices() + with torch.no_grad(): + next_actions_il = super().forward( + cond=next_obs, + deterministic=False, + network_override=self.imitation_policy, + ) + next_actions_rl = super().forward( + cond=next_obs, + deterministic=False, + network_override=self.target_actor, + ) + + # get the IL Q value + next_q1_il = self.target_networks[q1_ind](next_obs, next_actions_il) + next_q2_il = self.target_networks[q2_ind](next_obs, next_actions_il) + next_q_il = torch.min(next_q1_il, next_q2_il) + + # get the RL Q value + next_q1_rl = self.target_networks[q1_ind](next_obs, next_actions_rl) + next_q2_rl = self.target_networks[q2_ind](next_obs, next_actions_rl) + next_q_rl = torch.min(next_q1_rl, next_q2_rl) + + # take the max Q value + next_q = torch.where(next_q_il > next_q_rl, next_q_il, next_q_rl) + + # target value + target_q = rewards + gamma * (1 - dones) * next_q # (B,) + + # run all critics in batch + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, actions) + ) # (n_critics, B) + loss_critic = torch.mean((current_q - target_q[None]) ** 2) + return loss_critic + + def loss_actor(self, obs): + action = super().forward( + obs, + deterministic=False, + reparameterize=True, + ) # use online policy only, also IBRL does not use tanh squashing + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, action) + ) # (n_critics, B) + current_q = current_q.min( + dim=0 + ).values # unlike RLPD, IBRL uses the 
min Q value for actor update + loss_actor = -torch.mean(current_q) + return loss_actor + + def update_target_critic(self, tau): + """need to use ensemble_params instead of critic_networks""" + for target_ind, target_critic in enumerate(self.target_networks): + for target_param_name, target_param in target_critic.named_parameters(): + source_param = self.ensemble_params[target_param_name][target_ind] + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + def update_target_actor(self, tau): + for target_param, source_param in zip( + self.target_actor.parameters(), self.network.parameters() + ): + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + # ---------- Sampling ----------# + + def forward( + self, + cond, + deterministic=False, + reparameterize=False, + ): + """use both pre-trained and online policies""" + q1_ind, q2_ind = self.get_random_indices() + + # sample an action from the imitation policy + imitation_action = super().forward( + cond=cond, + deterministic=deterministic, + network_override=self.imitation_policy, + ) + + # sample an action from the RL policy + rl_action = super().forward( + cond=cond, + deterministic=deterministic, + reparameterize=reparameterize, + ) + + # compute Q value of imitation policy + q_imitation_1 = self.critic_networks[q1_ind](cond, imitation_action) # (B,) + q_imitation_2 = self.critic_networks[q2_ind](cond, imitation_action) + q_imitation = torch.min(q_imitation_1, q_imitation_2) + + # compute Q value of RL policy + q_rl_1 = self.critic_networks[q1_ind](cond, rl_action) + q_rl_2 = self.critic_networks[q2_ind](cond, rl_action) + q_rl = torch.min(q_rl_1, q_rl_2) + + # soft sample or greedy + if self.soft_action_sample: + # compute the Q weights with probability proportional to exp(\beta * Q(a)) + qw_il = torch.exp(q_imitation * self.soft_action_sample_beta) + qw_rl = torch.exp(q_rl * self.soft_action_sample_beta) + q_weights = torch.softmax( + torch.stack([qw_il, qw_rl], dim=-1), + dim=-1, + ) + + # sample according to the weights + q_indices = torch.multinomial(q_weights, 1) + action = torch.where((q_indices == 0)[:, None], imitation_action, rl_action) + else: + action = torch.where( + q_imitation > q_rl[:, None, None], + imitation_action, + rl_action, + ) + return action diff --git a/model/rl/gaussian_rlpd.py b/model/rl/gaussian_rlpd.py index 1745781..6623253 100644 --- a/model/rl/gaussian_rlpd.py +++ b/model/rl/gaussian_rlpd.py @@ -40,8 +40,7 @@ def __init__( ] self.target_networks = nn.ModuleList(self.target_networks) - # Construct a "stateless" version of one of the models. It is "stateless" in - # the sense that the parameters are meta Tensors and do not have storage. + # Construct a "stateless" version of one of the models. It is "stateless" in the sense that the parameters are meta Tensors and do not have storage. 
base_model = deepcopy(self.critic_networks[0]) self.base_model = base_model.to("meta") self.ensemble_params, self.ensemble_buffers = torch.func.stack_module_state( @@ -85,10 +84,6 @@ def loss_critic(self, obs, next_obs, actions, rewards, dones, gamma, alpha): self.ensemble_params, self.ensemble_buffers, (obs, actions) ) # (n_critics, B) loss_critic = torch.mean((current_q - target_q[None]) ** 2) - # current_q = torch.stack( - # [critic(obs, actions) for critic in self.critic_networks], dim=-1 - # ) # (B, n_critics) - # loss_critic = torch.mean((current_q - target_q.unsqueeze(-1)) ** 2) return loss_critic def loss_actor(self, obs, alpha): @@ -102,10 +97,6 @@ def loss_actor(self, obs, alpha): self.ensemble_params, self.ensemble_buffers, (obs, action) ) # (n_critics, B) current_q = current_q.mean(dim=0) + alpha * (-logprob) - # current_q = torch.stack( - # [critic(obs, action) for critic in self.critic_networks], dim=-1 - # ) # (B, n_critics) - # current_q = current_q.mean(dim=-1) + alpha * (-logprob) loss_actor = -torch.mean(current_q) return loss_actor @@ -121,15 +112,6 @@ def loss_temperature(self, obs, alpha, target_entropy): def update_target_critic(self, tau): """need to use ensemble_params instead of critic_networks""" - # for target_critic, source_critic in zip( - # self.target_networks, self.critic_networks - # ): - # for target_param, source_param in zip( - # target_critic.parameters(), source_critic.parameters() - # ): - # target_param.data.copy_( - # target_param.data * (1.0 - tau) + source_param.data * tau - # ) for target_ind, target_critic in enumerate(self.target_networks): for target_param_name, target_param in target_critic.named_parameters(): source_param = self.ensemble_params[target_param_name][target_ind] diff --git a/model/rl/gaussian_sac.py b/model/rl/gaussian_sac.py index dbebaa0..c8ad1c4 100644 --- a/model/rl/gaussian_sac.py +++ b/model/rl/gaussian_sac.py @@ -76,20 +76,3 @@ def update_target_critic(self, tau): target_param.data.copy_( target_param.data * (1.0 - tau) + source_param.data * tau ) - - # ---------- Sampling ----------# - - def forward( - self, - cond, - deterministic=False, - reparameterize=False, # allow gradient - get_logprob=False, - ): - return super().forward( - cond=cond, - deterministic=deterministic, - reparameterize=reparameterize, - get_logprob=get_logprob, - apply_squashing=True, - ) diff --git a/script/dataset/README.md b/script/dataset/README.md index 8d71e7b..8591434 100644 --- a/script/dataset/README.md +++ b/script/dataset/README.md @@ -1,3 +1,17 @@ ## Data processing scripts -These are some scripts used for processing the raw datasets from the benchmarks. We already pre-processed them and provide the final datasets. These scripts are for information only. \ No newline at end of file +These are some scripts used for processing the raw datasets from the benchmarks. We already pre-processed them and provide the final datasets. These scripts are for information only. 
+
+```console
+python script/dataset/get_d4rl_dataset.py --env_name=hopper-medium-v2 --save_dir=data/gym/hopper-medium-v2
+python script/dataset/process_robomimic_dataset.py --load_path=../robomimic_raw_data/lift_low_dim_v141.hdf5 --save_dir=data/robomimic/lift --normalize
+```
+
+The raw robomimic data can be downloaded by cloning the robomimic repository and running:
+```console
+cd ~/robomimic/robomimic/scripts
+python download_datasets.py --tasks all --dataset_types mh --hdf5_types low_dim # state-only policy
+python download_datasets.py --tasks all --dataset_types mh --hdf5_types raw # pixel-based policy
+# for pixel, replay the trajectories to extract image observations
+python robomimic/scripts/dataset_states_to_obs.py --done_mode 2 --dataset datasets/can/mh/demo_v141.hdf5 --output_name image_v141.hdf5 --camera_names robot0_eye_in_hand --camera_height 96 --camera_width 96 --exclude-next-obs --n 100
+```
\ No newline at end of file
diff --git a/script/dataset/get_d4rl_dataset.py b/script/dataset/get_d4rl_dataset.py
index f52bf43..76ddb9f 100644
--- a/script/dataset/get_d4rl_dataset.py
+++ b/script/dataset/get_d4rl_dataset.py
@@ -77,14 +77,16 @@ def make_dataset(env_name, save_dir, save_name_prefix, val_split, logger):
         # Get the trajectory length and slice
         traj_length = cur_index - prev_index + 1
-        trajectory = {key: dataset[key][prev_index : cur_index + 1] for key in ["states", "actions", "rewards"]}
+        trajectory = {
+            key: dataset[key][prev_index : cur_index + 1]
+            for key in ["states", "actions", "rewards"]
+        }
 
         # Skip if there is no reward in the episode
         if np.sum(trajectory["rewards"]) > 0:
             # Scale observations and actions
             trajectory["states"] = (
-                2 * (trajectory["states"] - obs_min) / (obs_max - obs_min + 1e-6)
-                - 1
+                2 * (trajectory["states"] - obs_min) / (obs_max - obs_min + 1e-6) - 1
             )
             trajectory["actions"] = (
                 2