From bc412b727a276e75154af577d622fbbdad2e9b2f Mon Sep 17 00:00:00 2001 From: josiahls Date: Fri, 25 Oct 2019 22:31:20 -0400 Subject: [PATCH] Version 0_8_0: Stable (#9) * Init new branch * Added: * Action object: For validating action sizes / dims, bundling important info * State: For validating state sizes / dims, bundling important info * Bounds: For determining the dtypes of its parent object, and determining if the object is discrete * Initial mass refactor of MDPDataset * Max Step expectation * Fixed: * MDPDataset episode iteration Notes: * Plan to add a generic MDPStep list validation function. A few things to expect: - There should never be 2 "done" steps in a row - Right after a done step, the step counter should show 0. - There should never be Nones in the state values. - Need to check for bad copies * Added: * WrapperLossFunc for compatibility with the existing fastai fit function * native fastai fit function compatibility :)))))))) * Single Learner that subclasses fastai Learner Notes: * Next Commit will have the old code removed * Fixed: * Memory handler. For now keeping k top. Less confusing implementation. * Removed old code Notes: * Need to now revise the DQN's and DDPG's to be compatible. * Fixed: * DDPG compat * Other DQN compat Removed: * nn and cnn generic functions for now. They were just way too confusing :( . The DDPG works now also. * Interpreter test code. Notes: * Interpreter is broken for now * Redo Tests --- README.md | 41 +- azure-pipelines.yml | 2 +- docs_src/rl.agents.dqnfixedtarget.ipynb | 10 +- docs_src/rl.core.mdp_interpreter.ipynb | 12 +- fast_rl/agents/BaseAgent.py | 78 +- fast_rl/agents/DDPG.py | 127 ++-- fast_rl/agents/DQN.py | 237 +++---- fast_rl/core/Envs.py | 19 +- fast_rl/core/Interpreter.py | 18 +- fast_rl/core/Learner.py | 37 - fast_rl/core/MarkovDecisionProcess.py | 900 ++++++++++++------------ fast_rl/core/agent_core.py | 107 +-- fast_rl/core/basic_train.py | 31 + fast_rl/tests/test_DataBunch.py | 40 +- fast_rl/tests/test_Envs.py | 66 +- fast_rl/tests/test_Interpretation.py | 50 -- fast_rl/tests/test_MDPDataBunch.py | 275 ++++++-- fast_rl/tests/test_agent_core.py | 46 +- fast_rl/tests/test_ddpg_models.py | 42 +- fast_rl/tests/test_dqn_models.py | 93 +-- fast_rl/tests/test_metrics.py | 10 - fast_rl/util/exceptions.py | 4 + fast_rl/util/misc.py | 5 + fast_rl/util/random_thingy.py | 23 +- 24 files changed, 1040 insertions(+), 1233 deletions(-) delete mode 100644 fast_rl/core/Learner.py create mode 100644 fast_rl/core/basic_train.py create mode 100644 fast_rl/util/exceptions.py diff --git a/README.md b/README.md index 2803e12..4752e34 100644 --- a/README.md +++ b/README.md @@ -70,41 +70,10 @@ known environments. Prior to 1.0.0, new changes might break previous code versio working at their best. Post 1.0.0 will be more formal feature development with CI, unit testing, etc. **Critical** -- [X] 0.0.0 MDPDataBunch: Finished to the point of being useful. Please reference: `tests/test_Envs` -Example: -```python -from fast_rl.core.Envs import Envs -from fast_rl.core.MarkovDecisionProcess import MDPDataBunch - -# At present will try to load OpenAI, box2d, pybullet, atari, maze. -# note "get_all_latest_envs" has a key inclusion and exclusion so if you don't have some of these envs installed, -# you can avoid this here. Certain envs just flat out do not work / are unusual. You are welcome to see how to get them -# working. -for env in Envs.get_all_latest_envs(): - max_steps = 50 # Limit the number of per episode iterations for now. 
- print(f'Testing {env}') - mdp_databunch = MDPDataBunch.from_env(env, max_steps=max_steps, num_workers=0) - if mdp_databunch is None: - print(f'Env {env} is probably Mujoco... Add imports if you want and try on your own. Don\'t like ' - f'proprietary engines like this. If you have any issues, feel free to make a PR!') - else: - epochs = 1 # N episodes to run - for epoch in range(epochs): - for state in mdp_databunch.train_dl: - # Instead of random action, you would have your agent here - mdp_databunch.train_ds.actions = mdp_databunch.train_ds.get_random_action() - - for state in mdp_databunch.valid_dl: - # Instead of random action, you would have your agent here and have exploration to 0 - mdp_databunch.valid_ds.actions = mdp_databunch.valid_ds.get_random_action() -``` -- [X] 0.1.0 DQN Agent: Reference `tests/test_Learner/test_basic_dqn_model_maze`. We use Learner callbacks for -handling the different fit behaviors. - Testable code: ```python from fast_rl.agents.DQN import DQN -from fast_rl.core.Learner import AgentLearner +from fast_rl.core.basic_train import AgentLearner from fast_rl.core.MarkovDecisionProcess import MDPDataBunch data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human') @@ -130,13 +99,15 @@ Usage example: ```python from fast_rl.agents.DQN import DQN from fast_rl.core.Interpreter import AgentInterpretationAlpha -from fast_rl.core.Learner import AgentLearner +from fast_rl.core.basic_train import AgentLearner from fast_rl.core.MarkovDecisionProcess import MDPDataBunch data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human') model = DQN(data) learn = AgentLearner(data, model) learn.fit(10) + +# Note that the Interpretation is broken, will be fixed with documentation in 0.9 interp = AgentInterpretationAlpha(learn) interp.plot_heatmapped_episode(5) ``` @@ -229,8 +200,8 @@ learn.fit(5) reset commit - [X] 0.7.0 Full test suite using multi-processing. Connect to CI. -- [ ] **Working On** 0.8.0 Comprehensive model eval **debug/verify**. Each model should succeed at at least a few known environments. Also, massive refactoring will be needed. -- [ ] 0.9.0 Notebook demonstrations of basic model usage. +- [X] 0.8.0 Comprehensive model eval **debug/verify**. Each model should succeed at at least a few known environments. Also, massive refactoring will be needed. +- [ ] **Working on** 0.9.0 Notebook demonstrations of basic model usage. - [ ] **1.0.0** Base version is completed with working model visualizations proving performance / expected failure. At this point, all models should have guaranteed environments they should succeed in. 
- [ ] 1.2.0 Add PyBullet Fetch Environments diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 7db0446..f691bc4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -30,7 +30,7 @@ steps: displayName: 'Install Python Packages' - script: | - xvfb-run -s "-screen 0 1400x900x24" pytest -n 8 fast_rl/tests --doctest-modules --junitxml=junit/test-results.xml --cov=./ --cov-report=xml --cov-report=html + xvfb-run -s "-screen 0 1400x900x24" pytest -n 2 fast_rl/tests --doctest-modules --junitxml=junit/test-results.xml --cov=./ --cov-report=xml --cov-report=html displayName: 'Test with pytest' - task: PublishTestResults@2 diff --git a/docs_src/rl.agents.dqnfixedtarget.ipynb b/docs_src/rl.agents.dqnfixedtarget.ipynb index dd7e837..e3b1a7f 100644 --- a/docs_src/rl.agents.dqnfixedtarget.ipynb +++ b/docs_src/rl.agents.dqnfixedtarget.ipynb @@ -15,10 +15,10 @@ "import fast_rl.agents.DQN \n", "from fast_rl.agents.DQN import DQN, FixedTargetDQN, DoubleDQN, DuelingDQN, DoubleDuelingDQN\n", "from fast_rl.core.Interpreter import AgentInterpretationAlpha\n", - "from fast_rl.core.Learner import AgentLearner\n", - "from fast_rl.core.MarkovDecisionProcess import MDPDataBunch\n", + "from fast_rl.core.Learner import AgentLearnerAlpha\n", + "from fast_rl.core.MarkovDecisionProcess import MDPDataBunchAlpha\n", "from fast_rl.core.agent_core import PriorityExperienceReplay, ExperienceReplay\n", - "from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, FEED_TYPE_IMAGE, FEED_TYPE_STATE\n", + "from fast_rl.core.MarkovDecisionProcess import MDPDataBunchAlpha, FEED_TYPE_IMAGE, FEED_TYPE_STATE\n", "from fast_rl.core.agent_core import ExperienceReplay, GreedyEpsilon\n", "import sys\n", "import importlib" @@ -423,12 +423,12 @@ } ], "source": [ - "data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=1000)\n", + "data = MDPDataBunchAlpha.from_env('maze-random-5x5-v0', render='human', max_steps=1000)\n", "model = FixedTargetDQN(data, batch_size=128, max_episodes=50, lr=0.001, copy_over_frequency=3,\n", " memory=ExperienceReplay(10000), discount=0.99, \n", " exploration_strategy=GreedyEpsilon(epsilon_start=1, epsilon_end=0.1,\n", " decay=0.001, do_exploration=True))\n", - "learn = AgentLearner(data, model)\n", + "learn = AgentLearnerAlpha(data, model)\n", "\n", "learn.fit(50)" ] diff --git a/docs_src/rl.core.mdp_interpreter.ipynb b/docs_src/rl.core.mdp_interpreter.ipynb index c7e38aa..315ead8 100644 --- a/docs_src/rl.core.mdp_interpreter.ipynb +++ b/docs_src/rl.core.mdp_interpreter.ipynb @@ -79,14 +79,14 @@ "import numpy as np\n", "\n", "from fast_rl.agents.DQN import DQN\n", - "from fast_rl.core.Learner import AgentLearner\n", - "from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, MDPDataset\n", + "from fast_rl.core.Learner import AgentLearnerAlpha\n", + "from fast_rl.core.MarkovDecisionProcess import MDPDataBunchAlpha, MDPDatasetAlpha\n", "from fast_rl.core.Interpreter import AgentInterpretationAlpha\n", "%matplotlib inline\n", " \n", - "data = MDPDataBunch.from_env('CartPole-v1', render='human', bs=64)\n", + "data = MDPDataBunchAlpha.from_env('CartPole-v1', render='human', bs=64)\n", "model = DQN(data)\n", - "learn = AgentLearner(data, model)\n", + "learn = AgentLearnerAlpha(data, model)\n", "\n", "learn.fit(5)" ] @@ -106,9 +106,9 @@ "metadata": {}, "outputs": [], "source": [ - "data = MDPDataBunch.from_pickle('CartPole-v1', render='human', bs=64)\n", + "data = MDPDataBunchAlpha.from_pickle('CartPole-v1', render='human', bs=64)\n", "model = DQN(data)\n", - "learn = 
AgentLearner(data, model)" + "learn = AgentLearnerAlpha(data, model)" ] }, { diff --git a/fast_rl/agents/BaseAgent.py b/fast_rl/agents/BaseAgent.py index d6a7a28..f577e7e 100644 --- a/fast_rl/agents/BaseAgent.py +++ b/fast_rl/agents/BaseAgent.py @@ -1,15 +1,12 @@ from math import floor +from typing import Collection -import gym +import numpy as np import torch -from fastai.basic_train import LearnerCallback, Any -from fastai.callback import Callback +from fastai.basic_train import LearnerCallback from fastai.layers import bn_drop_lin from gym.spaces import Discrete, Box from torch import nn -from traitlets import List -import numpy as np -from typing import Collection from fast_rl.core.MarkovDecisionProcess import MDPDataBunch from fast_rl.core.agent_core import ExplorationStrategy @@ -29,6 +26,7 @@ def __init__(self, data: MDPDataBunch): self.loss = None self.out = None self.opt = None + self.warming_up = False self.learner_callbacks = [] # type: Collection[LearnerCallback] # Root model that will be accessed for action decisions self.action_model = None # type: nn.Module @@ -77,39 +75,6 @@ def forward(self, x): return x.view(x.size(0), -1) - - -def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use_embed=False, - activation_function=None, final_activation_function=None, action_val_to_dim=True): - """Generates an nn module. - - Notes: - TabularModel could possibly be used along side a cnn learner instead. Will be a good idea to investigate. - - Returns: - - """ - act = nn.LeakyReLU if activation_function is None else activation_function - # For now the dimension of the action does not make a difference. - action_size = action_size[0] if not action_val_to_dim else action_size[1] - # For now keep drop out as 0, test including dropout later - ps = [0] * len(layer_list) - sizes = [state_size] + layer_list + [action_size] - actns = [act() for _ in range(len(sizes) - 2)] + [None] - layers = [] - for i, (n_in, n_out, dp, act) in enumerate(zip(sizes[:-1], sizes[1:], [0.] 
+ ps, actns)): - if i == 0 and use_embed: - embedded, n_in = get_embedded(n_in[0], n_out, n_in[1], 5) - layers += [ToLong(), embedded, Flatten()] - elif i == 0: n_in = n_in[0] - if i == 0 and use_bn: layers += [nn.BatchNorm1d(n_in)] - - layers += bn_drop_lin(n_in, n_out, bn=use_bn and i != 0, p=dp, actn=act) - - if final_activation_function is not None: layers += [final_activation_function()] - return nn.Sequential(*layers) - - def get_next_conv_shape(c_w, c_h, stride, kernel_size): h = floor((c_h - kernel_size - 2) / stride) + 1 # 3 convolutional layers given (3c, 640w, 640h) w = floor((c_w - kernel_size - 2) / stride) + 1 @@ -117,7 +82,7 @@ def get_next_conv_shape(c_w, c_h, stride, kernel_size): def get_conv(input_tuple, act, kernel_size, stride, n_conv_layers, layers): - """ + r""" Useful guideline for convolutional net shape change: Shape: @@ -141,37 +106,12 @@ def get_conv(input_tuple, act, kernel_size, stride, n_conv_layers, layers): n_conv_layers: layers: """ - h, w = input_tuple[0], input_tuple[1] + h, w = input_tuple[1], input_tuple[2] conv_layers = [SwapImageChannel()] for i in range(n_conv_layers): h, w = get_next_conv_shape(h, w, stride, kernel_size) - conv_layers.append(torch.nn.Conv2d(input_tuple[2], 3, kernel_size=kernel_size, stride=stride)) + conv_layers.append(torch.nn.Conv2d(input_tuple[3], 3, kernel_size=kernel_size, stride=stride)) conv_layers.append(act) - return layers + conv_layers, 3 * (h + 1) * (w + 1) - -def create_cnn_model(layer_list: list, action_size, state_size, use_bn=False, kernel_size=5, stride=3, n_conv_layers=3, - activation_function=None, final_activation_function=None, action_val_to_dim=True): - """Generates an nn module. - - Notes: - TabularModel could possibly be used along side a cnn learner instead. Will be a good idea to investigate. - - Returns: - - """ - act = nn.LeakyReLU if activation_function is None else activation_function - # For now keep drop out as 0, test including dropout later - ps = [0] * len(layer_list) - action_size = action_size[0] if not action_val_to_dim else action_size[1] - sizes = [state_size[0]] + layer_list + [action_size] - actns = [act() for _ in range(n_conv_layers + len(sizes) - 2)] + [None] - layers = [] - for i, (n_in, n_out, dp, act) in enumerate(zip(sizes[:-1], sizes[1:], [0.] 
+ ps, actns)): - if type(n_in) == tuple: - layers, n_in = get_conv(n_in, act, kernel_size, n_conv_layers=n_conv_layers, layers=layers, stride=stride) - layers += [Flatten()] - - layers += bn_drop_lin(n_in, n_out, bn=use_bn and i != 0, p=dp, actn=act) - if final_activation_function is not None: layers += [final_activation_function()] - return nn.Sequential(*layers) \ No newline at end of file + output_size = torch.prod(torch.tensor(nn.Sequential(*(layers + conv_layers))(torch.rand(input_tuple)).shape)) + return layers + conv_layers, output_size diff --git a/fast_rl/agents/DDPG.py b/fast_rl/agents/DDPG.py index 1f591ca..ac523b1 100644 --- a/fast_rl/agents/DDPG.py +++ b/fast_rl/agents/DDPG.py @@ -1,18 +1,28 @@ from copy import deepcopy -import torch -from fastai.basic_train import LearnerCallback, Any, OptimWrapper, ifnone, F import numpy as np -from fastai.metrics import RMSE +import torch +from fastai.basic_train import LearnerCallback, Any, OptimWrapper, ifnone from torch import nn from torch.nn import MSELoss from torch.optim import Adam -from fast_rl.agents.BaseAgent import BaseAgent, create_nn_model, create_cnn_model, get_next_conv_shape, get_conv, \ +from fast_rl.agents.BaseAgent import BaseAgent, get_conv, \ Flatten -from fast_rl.core.Learner import AgentLearner -from fast_rl.core.MarkovDecisionProcess import MDPDataBunch -from fast_rl.core.agent_core import GreedyEpsilon, ExperienceReplay +from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, Action, State +from fast_rl.core.agent_core import ExperienceReplay, OrnsteinUhlenbeck + + +def get_action_ddpg_cnn(layers, action: Action, state: State, activation=nn.ReLU, kernel_size=5, stride=2): + module_layers, out_size = get_conv(state.s.shape, activation(), kernel_size=kernel_size, stride=stride, + n_conv_layers=3, layers=[]) + module_layers += [Flatten()] + layers.append(action.taken_action.shape[1]) + for i, layer in enumerate(layers): + module_layers += [nn.Linear(out_size, layer)] if i == 0 else [nn.Linear(layers[i - 1], layer)] + module_layers += [activation()] + + return nn.Sequential(*module_layers) class BaseDDPGCallback(LearnerCallback): @@ -28,8 +38,6 @@ def on_train_begin(self, n_epochs, **kwargs: Any): def on_epoch_begin(self, epoch, **kwargs: Any): self.episode = epoch - # if self.learn.model.training and self.iteration != 0: - # self.learn.model.memory.update(item=self.learn.data.x.items[-1]) self.iteration = 0 def on_loss_begin(self, **kwargs: Any): @@ -41,21 +49,42 @@ def on_loss_begin(self, **kwargs: Any): post_optimize = self.learn.model.optimize() if self.learn.model.training: self.learn.model.memory.refresh(post_optimize=post_optimize) - # if self.iteration % self.copy_over_frequency == 0: self.learn.model.target_copy_over() self.iteration += 1 - # - # def on_epoch_end(self, **kwargs: Any): - # if self.episode % self.copy_over_frequency == 0: - # self.learn.model.target_copy_over() + + +class NNActor(nn.Module): + def __init__(self, layers, action: Action, state: State, activation=nn.ReLU, embed=False): + super().__init__() + layers += [action.taken_action.shape[1]] + module_layers = [] + + for i, layer in enumerate(layers): + module_layers.append(nn.Linear(state.s.shape[1] if i == 0 else layers[i - 1], layer)) + if i != len(layers) - 1: module_layers.append(activation()) + + module_layers += [nn.Tanh()] + self.model = nn.Sequential(*module_layers) + + def forward(self, x): + return self.model(x) + + +class CNNActor(nn.Module): + def __init__(self, layers, action: Action, state: State, activation=nn.ReLU): + 
super().__init__() + # This is still some complete overlap in nn builders, for here, the default function has everything we need + self.model = get_action_ddpg_cnn(layers, action, state, activation=activation, kernel_size=5, stride=2) + + def forward(self, x): + return self.model(x) class NNCritic(nn.Module): - def __init__(self, layer_list: list, action_size, state_size, use_bn=False, use_embed=True, - activation_function=None): + def __init__(self, layer_list: list, action: Action, state: State): super().__init__() - self.action_size = action_size[0] - self.state_size = state_size[0] + self.action_size = action.taken_action.shape[1] + self.state_size = state.s.shape[1] self.fc1 = nn.Linear(self.state_size, layer_list[0]) self.fc2 = nn.Linear(layer_list[0] + self.action_size, layer_list[1]) @@ -72,10 +101,10 @@ def forward(self, x): class CNNCritic(nn.Module): - def __init__(self, layer_list: list, action_size, state_size, activation_function=None): + def __init__(self, action: Action, state: State): super().__init__() - self.action_size = action_size[0] - self.state_size = state_size[0] + self.action_size = action.taken_action.shape[1] + self.state_size = state.s.shape layers = [] layers, input_size = get_conv(self.state_size, nn.LeakyReLU(), 8, 2, 3, layers) @@ -97,15 +126,15 @@ def forward(self, x): class DDPG(BaseAgent): - def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, batch=64, discount=0.99, + def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, discount=0.99, lr=1e-3, actor_lr=1e-4, exploration_strategy=None): """ - Implementation of a continuous control algorithm using an actor/critic architecture. + Implementation of a discrete control algorithm using an actor/critic architecture. Notes: Uses 4 networks, 2 actors, 2 critics. All models use batch norm for feature invariance. - NNCritic simply predicts Q while the Actor proposes the actions to take given a state s. + NNCritic simply predicts Q while the Actor proposes the actions to take given a s s. References: [1] Lillicrap, Timothy P., et al. "Continuous control with deep reinforcement learning." @@ -115,7 +144,6 @@ def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, batch=64, discount data: Primary data object to use. memory: How big the memory buffer will be for offline training. tau: Defines how "soft/hard" we will copy the target networks over to the primary networks. - batch: Size of per memory query. discount: Determines the amount of discounting the existing Q reward. lr: Rate that the opt will learn parameter gradients. 
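+            actor_lr: Separate learning rate used when updating the actor (defaults to 1e-4 in the signature).
+            exploration_strategy: Strategy for action noise during training; if None, OrnsteinUhlenbeck noise is used.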
""" @@ -123,8 +151,9 @@ def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, batch=64, discount self.name = 'DDPG' self.lr = lr self.discount = discount - self.batch = batch self.tau = 1 + self.warming_up = True + self.batch_size = data.train_ds.bs self.memory = ifnone(memory, ExperienceReplay(10000)) self.action_model = self.initialize_action_model([400, 300], data) @@ -143,28 +172,23 @@ def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, batch=64, discount self.loss_func = MSELoss() - self.exploration_strategy = ifnone(exploration_strategy, GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, - decay=0.001, - do_exploration=self.training)) + self.exploration_strategy = ifnone(exploration_strategy, OrnsteinUhlenbeck(size=data.action.taken_action.shape, + epsilon_start=1, epsilon_end=0.1, + decay=0.001, + do_exploration=self.training)) def initialize_action_model(self, layers, data): - actions, state = data.get_action_state_size() - if type(state[0]) is tuple and len(state[0]) == 3: - # actions, state = actions[0], state[0] - # If the shape has 3 dimensions, we will try using cnn's instead. - return create_cnn_model([200, 200], actions, state, False, kernel_size=8, - final_activation_function=nn.Tanh, action_val_to_dim=False) + if len(data.state.s.shape) == 4 and data.state.s.shape[-1] < 4: + return CNNActor(layers, data.action, data.state) else: - return create_nn_model(layers, *data.get_action_state_size(), False, use_embed=data.train_ds.embeddable, - final_activation_function=nn.Tanh, action_val_to_dim=False) + return NNActor(layers, data.action, data.state) def initialize_critic_model(self, layers, data): - """ Instead of state -> action, we are going state + action -> single expected reward. """ - actions, state = data.get_action_state_size() - if type(state[0]) is tuple and len(state[0]) == 3: - return CNNCritic(layers, *data.get_action_state_size()) + """ Instead of s -> action, we are going s + action -> single expected reward. """ + if len(data.state.s.shape) == 4 and data.state.s.shape[-1] < 4: + return CNNCritic(data.action, data.state) else: - return NNCritic(layers, *data.get_action_state_size()) + return NNCritic(layers, data.action, data.state) def pick_action(self, x): if self.training: self.action_model.eval() @@ -174,7 +198,7 @@ def pick_action(self, x): return np.clip(action, -1, 1) def optimize(self): - """ + r""" Performs separate updates to the actor and critic models. Get the predicted yi for optimizing the actor: @@ -187,14 +211,17 @@ def optimize(self): Returns: """ - if len(self.memory) > self.batch: + if len(self.memory) > self.batch_size: + self.warming_up = False # Perhaps have memory as another item list? Should investigate. - sampled = self.memory.sample(self.batch) + sampled = self.memory.sample(self.batch_size) - r = torch.from_numpy(np.array([item.reward for item in sampled]).astype(float)).float() - s_prime = torch.from_numpy(np.array([item.result_state for item in sampled])).float() - s = torch.from_numpy(np.array([item.current_state for item in sampled])).float() - a = torch.from_numpy(np.array([item.actions for item in sampled]).astype(float)).float() + with torch.no_grad(): + r = torch.cat([item.reward.float() for item in sampled]) + s_prime = torch.cat([item.s_prime.float() for item in sampled]) + s = torch.cat([item.s.float() for item in sampled]) + a = torch.cat([item.a.float() for item in sampled]) + # d = torch.cat([item.done.float() for item in sampled]) # Do we need a mask?? 
with torch.no_grad(): y = r + self.discount * self.t_critic_model((s_prime, self.t_action_model(s_prime))) @@ -234,7 +261,7 @@ def target_copy_over(self): def soft_target_copy_over(self, t_m, f_m, tau): for target_param, local_param in zip(t_m.parameters(), f_m.parameters()): - target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) + target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def interpret_q(self, items): with torch.no_grad(): diff --git a/fast_rl/agents/DQN.py b/fast_rl/agents/DQN.py index 462bc62..c6edd2f 100644 --- a/fast_rl/agents/DQN.py +++ b/fast_rl/agents/DQN.py @@ -1,20 +1,19 @@ -import copy from copy import deepcopy from functools import partial +from typing import Tuple -import numpy as np import torch from fastai.basic_train import LearnerCallback, Any, F, OptimWrapper, ifnone from torch import optim, nn -from fast_rl.agents.BaseAgent import BaseAgent, create_nn_model, create_cnn_model, ToLong, get_embedded, Flatten -from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, FEED_TYPE_IMAGE +from fast_rl.agents.BaseAgent import BaseAgent, ToLong, get_embedded, Flatten, get_conv +from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, MDPDataset, State, Action from fast_rl.core.agent_core import ExperienceReplay, GreedyEpsilon class BaseDQNCallback(LearnerCallback): def __init__(self, learn, max_episodes=None): - """Handles basic DQN end of step model optimization.""" + r"""Handles basic DQN end of step model optimization.""" super().__init__(learn) self.n_skipped = 0 self._persist = max_episodes is not None @@ -33,11 +32,8 @@ def on_epoch_begin(self, epoch, **kwargs: Any): self.iteration = 0 def on_loss_begin(self, **kwargs: Any): - """Performs memory updates, exploration updates, and model optimization.""" - if self.learn.model.training and self.previous_item is not None: - if self.learn.data.x.items[-2].done: self.previous_item.done = self.learn.data.x.items[-2].done - self.learn.model.memory.update(item=self.previous_item) - self.previous_item = copy.deepcopy(self.learn.data.x.items[-1]) + r"""Performs memory updates, exploration updates, and model optimization.""" + if self.learn.model.training: self.learn.model.memory.update(item=self.learn.data.train_ds.x.items[-1]) self.learn.model.exploration_strategy.update(self.episode, max_episodes=self.max_episodes, do_exploration=self.learn.model.training) post_optimize = self.learn.model.optimize() @@ -45,8 +41,6 @@ def on_loss_begin(self, **kwargs: Any): self.iteration += 1 - - class FixedTargetDQNCallback(LearnerCallback): def __init__(self, learn, copy_over_frequency=3): """Handles updating the target model in a fixed target DQN. 
@@ -66,32 +60,38 @@ def on_step_end(self, **kwargs: Any): self.learn.model.target_copy_over() -class DQNActionNN(nn.Module): - def __init__(self, layers, action, state, activation=nn.ReLU, embed=False): - super().__init__() - - module_layers = [] - for i, size in enumerate(layers): - if i == 0: - if embed: - embedded, out = get_embedded(state[0], size, state[1], 5) - module_layers += [ToLong(), embedded, Flatten(), nn.Linear(out, size)] - else: - module_layers.append(nn.Linear(state[0], size)) +def get_action_dqn_fully_conn(layers, action: Action, state: State, activation=nn.ReLU, embed=False): + module_layers = [] + for i, size in enumerate(layers): + if i == 0: + if embed: + embedded, out = get_embedded(state.s.shape[1], size, state.n_possible_values, 5) + module_layers += [ToLong(), embedded, Flatten(), nn.Linear(out, size)] else: - module_layers.append(nn.Linear(layers[i-1], size)) - module_layers.append(activation()) + module_layers.append(nn.Linear(state.s.shape[1], size)) + else: + module_layers.append(nn.Linear(layers[i - 1], size)) + module_layers.append(activation()) + + module_layers.append(nn.Linear(layers[-1], action.n_possible_values)) + return nn.Sequential(*module_layers) - module_layers.append(nn.Linear(layers[-1], action[1])) - self.model = nn.Sequential(*module_layers) - def forward(self, x, **kwargs: Any): - return self.model(x) +def get_action_dqn_cnn(layers, action: Action, state: State, activation=nn.ReLU, kernel_size=5, stride=2): + module_layers, out_size = get_conv(state.s.shape, activation(), kernel_size=kernel_size, stride=stride, + n_conv_layers=3, layers=[]) + module_layers += [Flatten()] + layers.append(action.n_possible_values) + for i, layer in enumerate(layers): + module_layers += [nn.Linear(out_size, layer)] if i == 0 else [nn.Linear(layers[i - 1], layer)] + if i != len(layers) - 1: module_layers += [activation()] + + return nn.Sequential(*module_layers) class DQN(BaseAgent): - def __init__(self, data: MDPDataBunch, memory=None, batch_size=32, lr=0.01, discount=0.95, grad_clip=5, - max_episodes=None, exploration_strategy=None, use_embeddings=False): + def __init__(self, data: MDPDataBunch, memory=None, lr=0.01, discount=0.95, grad_clip=5, + max_episodes=None, exploration_strategy=None, use_embeddings=False, layers=None): """Trains an Agent using the Q Learning method on a neural net. Notes: @@ -105,16 +105,17 @@ def __init__(self, data: MDPDataBunch, memory=None, batch_size=32, lr=0.01, disc data: Used for size input / output information. """ super().__init__(data) - # TODO add recommend cnn based on state size? + # TODO add recommend cnn based on s size? 
self.name = 'DQN' self.use_embeddings = use_embeddings - self.batch_size = batch_size + self.batch_size = data.train_ds.bs self.discount = discount + self.warming_up = True self.lr = lr self.gradient_clipping_norm = grad_clip - self.loss_func = F.mse_loss #F.smooth_l1_loss + self.loss_func = F.mse_loss self.memory = ifnone(memory, ExperienceReplay(10000)) - self.action_model = self.initialize_action_model([24, 24], data) + self.action_model = self.initialize_action_model(ifnone(layers, [24, 24]), data.train_ds) self.opt = OptimWrapper.create(optim.Adam, lr=self.lr, layer_groups=[self.action_model]) self.learner_callbacks += [partial(BaseDQNCallback, max_episodes=max_episodes)] + self.memory.callbacks self.exploration_strategy = ifnone(exploration_strategy, GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, @@ -127,10 +128,9 @@ def init_weights(self, m): m.bias.data.fill_(0.01) def initialize_action_model(self, layers, data): - # if self.data.train_ds.feed_type == FEED_TYPE_IMAGE: model = create_cnn_model(layers, *data.get_action_state_size(), action_val_to_dim=True) - # else: model = create_nn_model(layers, *data.get_action_state_size(), use_embed=False, action_val_to_dim=True) - model = DQNActionNN(layers, *data.get_action_state_size(), embed=self.use_embeddings) # type: nn.Module - + if len(data.state.s.shape) == 4 and data.state.s.shape[-1] < 4: + model = get_action_dqn_cnn(deepcopy(layers), data.action, data.state, kernel_size=5, stride=2) + else: model = get_action_dqn_fully_conn(deepcopy(layers), data.action, data.state, embed=self.use_embeddings) model.apply(self.init_weights) return model @@ -138,8 +138,27 @@ def forward(self, x): x = super(DQN, self).forward(x) return self.action_model(x) + def sample_mask(self) -> Tuple[torch.tensor, torch.tensor, torch.tensor, torch.tensor, torch.tensor, torch.tensor]: + self.warming_up = False + # Perhaps have memory as another itemlist? Should investigate. + sampled = self.memory.sample(self.batch_size) + with torch.no_grad(): + r = torch.cat([item.reward.float() for item in sampled]) + s_prime = torch.cat([item.s_prime.float() for item in sampled]) + s = torch.cat([item.s.float() for item in sampled]) + a = torch.cat([item.a.long() for item in sampled]) + d = torch.cat([item.done.float() for item in sampled]) + + masking = torch.sub(1.0, d) + return r, s_prime, s, a, d, masking + + def calc_y_hat(self, s, a): return self.action_model(s).gather(1, a) + + def calc_y(self, s_prime, masking, r, y_hat): + return self.discount * self.action_model(s_prime).max(axis=1)[0].unsqueeze(1) * masking + r.expand_as(y_hat) + def optimize(self): - """Uses ER to optimize the Q-net (without fixed targets). + r"""Uses ER to optimize the Q-net (without fixed targets). Uses the equation: @@ -149,22 +168,15 @@ def optimize(self): Returns (dict): Optimization information + """ if len(self.memory) > self.batch_size: - # Perhaps have memory as another itemlist? Should investigate. 
- sampled = self.memory.sample(self.batch_size) - with torch.no_grad(): - r = torch.from_numpy(np.array([item.reward for item in sampled])).float() - s_prime = torch.from_numpy(np.array([item.result_state for item in sampled])).float() - s = torch.from_numpy(np.array([item.current_state for item in sampled])).float() - a = torch.from_numpy(np.array([item.actions for item in sampled])).long() - d = torch.from_numpy(np.array([item.done for item in sampled])).float() + r, s_prime, s, a, d, masking = self.sample_mask() - masking = torch.sub(1.0, d).unsqueeze(0) # Traditional `maze-random-5x5-v0` with have a model output a Nx4 output. # since r is just Nx1, we spread the reward into the actions. - y_hat = self.action_model(s).gather(1, a) - y = self.discount * self.action_model(s_prime).max(axis=1)[0].unsqueeze(1) * masking + r.expand_as(y_hat) + y_hat = self.calc_y_hat(s, a) + y = self.calc_y(s_prime, masking, r, y_hat) loss = self.loss_func(y, y_hat) @@ -183,17 +195,14 @@ def optimize(self): def interpret_q(self, items): with torch.no_grad(): - r = torch.from_numpy(np.array([item.reward for item in items])).float() - s_prime = torch.from_numpy(np.array([item.result_state for item in items])).float() - s = torch.from_numpy(np.array([item.current_state for item in items])).float() - a = torch.from_numpy(np.array([item.actions for item in items])).long() - + s = torch.cat([item.s.float() for item in items]) + a = torch.cat([item.a.long() for item in items]) return self.action_model(s).gather(1, a) class FixedTargetDQN(DQN): - def __init__(self, data: MDPDataBunch, memory=None, tau=0.01, copy_over_frequency=3, **kwargs): - """Trains an Agent using the Q Learning method on a 2 neural nets. + def __init__(self, data: MDPDataBunch, memory=None, tau=0.01, copy_over_frequency=3, **kwargs): + r"""Trains an Agent using the Q Learning method on a 2 neural nets. Notes: Unlike the base DQN, this is a true reflection of ref [1]. We use 2 models instead of one to allow for @@ -213,60 +222,26 @@ def __init__(self, data: MDPDataBunch, memory=None, tau=0.01, copy_over_frequenc self.learner_callbacks += [partial(FixedTargetDQNCallback, copy_over_frequency=copy_over_frequency)] def target_copy_over(self): - """ Updates the target network from calls in the FixedTargetDQNCallback callback.""" + r""" Updates the target network from calls in the FixedTargetDQNCallback callback.""" # self.target_net.load_state_dict(self.action_model.state_dict()) for target_param, local_param in zip(self.target_net.parameters(), self.action_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data) - def optimize(self): - """Uses ER to optimize the Q-net. - + def calc_y(self, s_prime, masking, r, y_hat): + r""" Uses the equation: .. math:: Q^{*}(s, a) = \mathbb{E}_{s'∼ \Big\epsilon} \Big[r + \lambda \displaystyle\max_{a'}(Q^{*}(s' , a')) \;|\; s, a \Big] - - Returns (dict): Optimization information """ - if len(self.memory) > self.batch_size: - # Perhaps have memory as another item list? Should investigate. 
- sampled = self.memory.sample(self.batch_size) - - with torch.no_grad(): - r = torch.from_numpy(np.array([item.reward for item in sampled])).float() - s_prime = torch.from_numpy(np.array([item.result_state for item in sampled])).float() - s = torch.from_numpy(np.array([item.current_state for item in sampled])).float() - a = torch.from_numpy(np.array([item.actions for item in sampled])).long() - d = torch.from_numpy(np.array([item.done for item in sampled])).float() - - # Traditional `maze-random-5x5-v0` with have a model output a Nx4 output. - # since r is just Nx1, we spread the reward into the actions. - y_hat = self.action_model(s).gather(1, a) - - masking = torch.sub(1.0, d).unsqueeze(1) - y = self.discount * self.target_net(s_prime).max(axis=1)[0].unsqueeze(1) * masking + r.expand_as(y_hat) - - loss = self.loss_func(y, y_hat) - self.loss = loss.cpu().detach() - - if self.training: - self.opt.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_norm_(self.action_model.parameters(), self.gradient_clipping_norm) - for param in self.action_model.parameters(): - param.grad.data.clamp_(-1, 1) - self.opt.step() - - with torch.no_grad(): - post_info = {'td_error': (y - y_hat).numpy()} - return post_info + return self.discount * self.target_net(s_prime).max(axis=1)[0].unsqueeze(1) * masking + r.expand_as(y_hat) class DoubleDQN(FixedTargetDQN): def __init__(self, data: MDPDataBunch, memory=None, copy_over_frequency=3, **kwargs): - """ + r""" Double DQN training. References: @@ -279,57 +254,20 @@ def __init__(self, data: MDPDataBunch, memory=None, copy_over_frequency=3, **kwa super().__init__(data=data, memory=memory, copy_over_frequency=copy_over_frequency, **kwargs) self.name = 'DDQN' - def optimize(self): - """Uses ER to optimize the Q-net. - - Uses the equation: - - .. math:: - Q^{*}(s, a) = \mathbb{E}_{s'∼ \Big\epsilon} \Big[r + \lambda \displaystyle\max_{}(Q^{*}(s' , \ - argmax_{a'}(Q(s', \Theta)), \Theta^{-})) \;|\; s, a \Big] - - Returns (dict): Optimization information - """ - if len(self.memory) > self.batch_size: - # Perhaps have memory as another itemlist? Should investigate. - sampled = self.memory.sample(self.batch_size) - with torch.no_grad(): - r = torch.from_numpy(np.array([item.reward for item in sampled])).float() - s_prime = torch.from_numpy(np.array([item.result_state for item in sampled])).float() - s = torch.from_numpy(np.array([item.current_state for item in sampled])).float() - a = torch.from_numpy(np.array([item.actions for item in sampled])).long() - - # Traditional `maze-random-5x5-v0` with have a model output a Nx4 output. - # since r is just Nx1, we spread the reward into the actions. 
- y_hat = self.action_model(s).gather(1, a) - y = self.discount * self.target_net(s_prime).gather(1, self.action_model(s_prime).argmax(axis=1).unsqueeze( - 1)) + r.expand_as(y_hat) - - loss = self.loss_func(y, y_hat) - self.loss = loss - - if self.training: - self.opt.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_norm_(self.action_model.parameters(), self.gradient_clipping_norm) - for param in self.action_model.parameters(): - param.grad.data.clamp_(-1, 1) - self.opt.step() - - with torch.no_grad(): - post_info = {'td_error': (y - y_hat).numpy()} - return post_info + def calc_y(self, s_prime, masking, r, y_hat): + return self.discount * self.target_net(s_prime).gather(1, self.action_model(s_prime).argmax(axis=1).unsqueeze( + 1)) * masking + r.expand_as(y_hat) class DuelingDQNModule(nn.Module): - def __init__(self, a_s, stream_input_size): + def __init__(self, action, stream_input_size): super().__init__() - self.val = create_nn_model([stream_input_size], (0, 1), (stream_input_size, 0)) - self.adv = create_nn_model([stream_input_size], a_s[0], (stream_input_size, 0)) + self.val = nn.Linear(stream_input_size, 1) + self.adv = nn.Linear(stream_input_size, action.n_possible_values) def forward(self, x): - """Splits the base neural net output into 2 streams to evaluate the advantage and values of the state space and + r"""Splits the base neural net output into 2 streams to evaluate the advantage and values of the s space and corresponding actions. .. math:: @@ -341,17 +279,14 @@ def forward(self, x): Returns: """ - val = self.val(x) - adv = self.adv(x) - + val, adv = self.val(x), self.adv(x) x = val.expand_as(adv) + (adv - adv.mean()).squeeze(0) return x class DuelingDQN(FixedTargetDQN): def __init__(self, data: MDPDataBunch, memory=None, **kwargs): - """Replaces the basic action model with a DuelingDQNModule which splits the basic model into 2 streams. - + r"""Replaces the basic action model with a DuelingDQNModule which splits the basic model into 2 streams. References: [1] Wang, Ziyu, et al. "Dueling network architectures for deep reinforcement learning." @@ -364,16 +299,14 @@ def __init__(self, data: MDPDataBunch, memory=None, **kwargs): self.name = 'Dueling DQN' def initialize_action_model(self, layers, data): - base = create_nn_model(layers, *data.get_action_state_size())[:-1] - a_s = data.get_action_state_size() - stream_input_size = base[-2].out_features - dueling_head = DuelingDQNModule(a_s=a_s, stream_input_size=stream_input_size) + base = super().initialize_action_model(layers, data)[:-2] + dueling_head = DuelingDQNModule(action=data.action, stream_input_size=base[-1].out_features) return nn.Sequential(base, dueling_head) class DoubleDuelingDQN(DoubleDQN, DuelingDQN): def __init__(self, data: MDPDataBunch, memory=None, **kwargs): - """ + r""" Combines both Dueling DQN and DDQN. Args: diff --git a/fast_rl/core/Envs.py b/fast_rl/core/Envs.py index 3807856..79fbe07 100644 --- a/fast_rl/core/Envs.py +++ b/fast_rl/core/Envs.py @@ -22,15 +22,16 @@ def ban(envs: list): banned_envs = { 'Defender': 'Defender (%s) seems to load for an extremely long time. Skipping for now. Determine cause.', 'Fetch': 'Fetch (%s) envs are not ready yet.', + 'Blackjack-v0': 'Blackjack (%s) does not have a render function... 
Skipping', 'InvertedPendulumMuJoCoEnv': 'Mujoco Inverted Pendulum (%s) has a bug.', - 'HopperMuJoCoEnv': '(%s) Does not pass azure pipeline tests', - 'InvertedDoublePendulumMuJoCoEnv': '(%s) Does not pass azure pipeline tests', - 'HalfCheetahMuJoCoEnv': '(%s) Does not pass azure pipeline tests', - 'HumanoidMuJoCoEnv': '(%s) Does not pass azure pipeline tests', - 'Walker2DMuJoCoEnv': '(%s) Does not pass azure pipeline tests', - 'AntMuJoCoEnv': '(%s) Does not pass azure pipeline tests', - 'AtlasPyBulletEnv': 'AtlasPyBulletEnv (%s) seems to load very slowly. Skipping for now.', - 'MazeEnv': '(%s) Having a maze view issue.', + # 'HopperMuJoCoEnv': '(%s) Does not pass azure pipeline tests', + # 'InvertedDoublePendulumMuJoCoEnv': '(%s) Does not pass azure pipeline tests', + # 'HalfCheetahMuJoCoEnv': '(%s) Does not pass azure pipeline tests', + # 'HumanoidMuJoCoEnv': '(%s) Does not pass azure pipeline tests', + # 'Walker2DMuJoCoEnv': '(%s) Does not pass azure pipeline tests', + # 'AntMuJoCoEnv': '(%s) Does not pass azure pipeline tests', + # 'AtlasPyBulletEnv': 'AtlasPyBulletEnv (%s) seems to load very slowly. Skipping for now.', + # 'MazeEnv': '(%s) Having a maze view issue.', } envs = np.array(envs)[list(map(partial(Envs._error_out, ban_list=banned_envs), envs))] @@ -47,4 +48,4 @@ def get_all_envs(key=None, exclude_key=None): def get_all_latest_envs(key=None, exclude_key=None): all_envs = Envs.get_all_envs(key, exclude_key) roots = list(set(map(lambda x: str(x).split('-v')[0], all_envs))) - return list(set([sorted([env for env in all_envs if env.__contains__(root)])[-1] for root in roots])) + return list(set([sorted([env for env in all_envs if env.split('-v')[0] == root])[-1] for root in roots])) diff --git a/fast_rl/core/Interpreter.py b/fast_rl/core/Interpreter.py index 327716c..59e7866 100644 --- a/fast_rl/core/Interpreter.py +++ b/fast_rl/core/Interpreter.py @@ -17,7 +17,7 @@ from torch import nn from fast_rl.core import Learner -from fast_rl.core.MarkovDecisionProcess import MarkovDecisionProcessSlice, FEED_TYPE_IMAGE +from fast_rl.core.MarkovDecisionProcess import MarkovDecisionProcessSliceAlpha, FEED_TYPE_IMAGE class AgentInterpretationAlpha(Interpretation): @@ -55,11 +55,11 @@ def normalize(self, item: np.array): def top_losses(self, k: int = None, largest=True): raise NotImplementedError - def reward_heatmap(self, episode_slices: List[MarkovDecisionProcessSlice], action=None): + def reward_heatmap(self, episode_slices: List[MarkovDecisionProcessSliceAlpha], action=None): """ Takes a state_space and uses the agent to heat map rewards over the space. - We first need to determine if the state space is discrete or continuous. + We first need to determine if the s space is discrete or discrete. Args: state_space: @@ -86,7 +86,7 @@ def reward_heatmap(self, episode_slices: List[MarkovDecisionProcessSlice], actio def plot_heatmapped_episode(self, episode, fig_size=(13, 5), action_index=None, return_heat_maps=False): """ - Generates plots of heatmapped state spaces for analyzing reward distribution. + Generates plots of heatmapped s spaces for analyzing reward distribution. Currently only makes sense for grid based envs. Will be expecting gym_maze environments that are discrete. 
@@ -94,9 +94,9 @@ def plot_heatmapped_episode(self, episode, fig_size=(13, 5), action_index=None, """ if not str(self.ds.env.spec).__contains__('maze'): - raise NotImplementedError('Currently only supports gym_maze envs that have discrete state spaces') + raise NotImplementedError('Currently only supports gym_maze envs that have discrete s spaces') if not isinstance(self.ds.state_size, Box): - raise NotImplementedError('Currently only supports Box based state spaces with 2 dimensions') + raise NotImplementedError('Currently only supports Box based s spaces with 2 dimensions') items = self._get_items() heat_maps = [] @@ -141,7 +141,7 @@ def plot_heatmapped_episode(self, episode, fig_size=(13, 5), action_index=None, if return_heat_maps: return heat_maps def plot_episode(self, episode): - items = self._get_items(False) # type: List[MarkovDecisionProcessSlice] + items = self._get_items(False) # type: List[MarkovDecisionProcessSliceAlpha] episode_counter = 0 # For each episode @@ -213,7 +213,7 @@ def plot_agent_accuracy_density(self, episode_num=None): Returns: """ - items = self._get_items(False) # type: List[MarkovDecisionProcessSlice] + items = self._get_items(False) # type: List[MarkovDecisionProcessSliceAlpha] x, y = self.get_agent_accuracy_density(items, episode_num) fig = plt.figure(figsize=(8, 8)) @@ -250,7 +250,7 @@ def plot_q_density(self, episode_num=None): Returns: """ - items = self._get_items(False) # type: List[MarkovDecisionProcessSlice] + items = self._get_items(False) # type: List[MarkovDecisionProcessSliceAlpha] x, y = self.get_q_density(items, episode_num) # Define the borders diff --git a/fast_rl/core/Learner.py b/fast_rl/core/Learner.py deleted file mode 100644 index 9d58097..0000000 --- a/fast_rl/core/Learner.py +++ /dev/null @@ -1,37 +0,0 @@ -from functools import partial - -from fastai.basic_train import Recorder, defaults, Learner, listify, ifnone - -from fast_rl.agents.BaseAgent import BaseAgent -from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, MDPMemoryManager -from fast_rl.core.agent_core import fit -from fast_rl.core.metrics import EpsilonMetric - - -class AgentLearner(object): - silent: bool = None - - def __init__(self, data: MDPDataBunch, model: BaseAgent, mem_strategy='k_partitions_both', k=1, - metrics=None): - """ - Will very soon subclass the fastai learner class. For now we need to understand the important functional - requirements needed in a learner. 
- - """ - self.model = model - self.data = data - self.metrics = [] - self.opt = self.model.opt - if self.silent is None: self.silent = defaults.silent - self.add_time: bool = True - self.recorder = Recorder(learn=self, add_time=self.add_time, silent=self.silent) - self.recorder.no_val = self.data.empty_val - self.callbacks = ifnone(metrics, []) - self.callbacks += [partial(MDPMemoryManager, mem_strategy=mem_strategy, k=k)] - self.callbacks = [self.recorder] + [f(self) for f in self.callbacks] + [f(learn=self) for f in self.model.learner_callbacks] - - def predict(self, element): - return self.model.pick_action(element) - - def fit(self, epochs): - fit(epochs, self, self.callbacks, metrics=self.metrics) diff --git a/fast_rl/core/MarkovDecisionProcess.py b/fast_rl/core/MarkovDecisionProcess.py index 300aa58..3d7ce4f 100644 --- a/fast_rl/core/MarkovDecisionProcess.py +++ b/fast_rl/core/MarkovDecisionProcess.py @@ -1,8 +1,8 @@ -from numbers import Integral - import gym -from fastai.basic_train import LearnerCallback -from gym.spaces import Discrete, Box +from fastai.basic_train import LearnerCallback, DatasetType +from gym.spaces import Discrete, Box, MultiDiscrete + +from fast_rl.util.exceptions import MaxEpisodeStepsMissingError try: # noinspection PyUnresolvedReferences @@ -17,231 +17,393 @@ print(f'Can\'t import one of these: {e}') from fastai.core import * -# noinspection PyProtectedMember -from fastai.data_block import ItemList, Tensor, Dataset, DataBunch, data_collate, DataLoader, PreProcessors +from fastai.data_block import ItemList, Tensor, Dataset, DataBunch, DataLoader from fastai.imports import torch -from fastai.vision import Image -from gym import error -from gym.envs.algorithmic.algorithmic_env import AlgorithmicEnv -from gym.envs.toy_text import discrete -from gym.wrappers import TimeLimit from datetime import datetime import pickle -from fast_rl.util.misc import b_colors +from fast_rl.util.misc import b_colors, list_in_str FEED_TYPE_IMAGE = 0 FEED_TYPE_STATE = 1 -class MDPMemoryManager(LearnerCallback): - def __init__(self, learn, mem_strategy, k, max_episodes=None, episode=0, iteration=0): +@dataclass +class Bounds(object): + r""" + Handles bounds for 1 dimensional spaces. + + between (gym.Space): A convenience variable for determining the min and max bounds + + discrete (bool): Whether the Bounds are discrete or not. Important for N possible values calc. + + min (list): Correlated min for a given dimension + + max (list): Correlated max for a given dimension + """ + between: Any = None + discrete: bool = False + min: list = None + max: list = None + + def __len__(self): + r""" + Returns the number of dimensions the bounds have. + + `self.min`'s length is returned because `self.max` and `self.min` as validated to have the same length. + """ + return len(self.min) + + @property + def n_possible_values(self): """ - Handles different ways of memory management: - - (k_partitions_best) keep partition best episodes - - (k_partitions_worst) keep partition worst episodes - - (k_partitions_both) keep partition worst best episodes - - (k_top_best) keep high fidelity k top episodes - - (k_top_worst) keep k top worst - - (k_top_both) keep k top worst and best - - (none) keep none, only load into memory (always keep first) - - (all): keep all steps will be kept (most memory inefficient) + Returns the maximum number of values that can be taken. + + This is important for doing embeddings. 
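+
+        For example, ``Bounds(between=gym.spaces.Discrete(4))`` has ``min=[0]`` and ``max=[4]``, so
+        ``n_possible_values`` is 4, while non-discrete bounds (e.g. a float Box) return ``np.inf``.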
+ """ + if not self.discrete: return np.inf + else: return np.prod(np.subtract(self.max, self.min)) + + def __post_init__(self): + """Sets min and max fields then validates them.""" + self.min, self.max = ifnone(self.min, []), ifnone(self.max, []) + # If a tuple has been passed, break it into correlated min max variables. + if self.between is not None: + for b in (self.between if isinstance(self.between, gym.spaces.Tuple) else listify(self.between)): + if isinstance(b, (int, np.int64, np.int, float, np.float)): + self.min, self.max = self.min + [0], self.max + [b] + self.discrete = isinstance(b, (int, np.int, np.int64)) + elif isinstance(b, Discrete): + self.min, self.max, self.discrete = self.min + [0], self.max + [b.n], True + elif isinstance(b, MultiDiscrete): + self.min, self.max, self.discrete = self.min + [0] * sum(b.nvec.shape), self.max + b.nvec, True + elif isinstance(b, Box): + self.min, self.max = self.min + list(b.low), self.max + list(b.high) + self.discrete = any([b.dtype in (int, np.int, np.int64)]) + else: + raise ValueError(f'Tuple not understood {self.between}') + + if len(self.min) != len(self.max): + raise ValueError(f'Min Max do not match min {len(self.min)} max {len(self.max)}') + if len(self.min) == 0: raise ValueError(f'Min and Max are 0') + + +@dataclass +class Action(object): + r""" + Handles actions, action space, and value verification. + + An important difference between taken_action and raw_action is that the raw action is the immediate + raw output from the model before any argmax processing. + + taken_action (np.array): Expected to always to be numpy arrays with shape (-1, 1). This is the action that + is to be input into the env `step` function. + + raw_action (np.array): Expected to always to be numpy arrays with shape (-1, 1). This is the raw model + output such as neural net final layer output. Can be None. + + action_space (gym.Space): Used for estimating the max number of values. This is important for embeddings. + + bounds (tuple): Maximum and minimum values for each action dimension. + + n_possible_values (int): An integer or inf value indicating the total number of possible actions there are. + """ + taken_action: torch.tensor + action_space: gym.Space + raw_action: torch.tensor = None + bounds: Bounds = None + n_possible_values: int = 0 + + def __post_init__(self): + # Determine bounds + self.bounds = Bounds(self.action_space) + self.n_possible_values = self.bounds.n_possible_values + + # Fix shapes + self.taken_action = torch.tensor(data=self.taken_action).reshape(1, -1) + if self.raw_action is not None: self.raw_action = torch.tensor(data=self.raw_action).reshape(1, -1) + + def get_single_action(self): + """ OpenAI envs do not like 1x1 arrays when they are expecting scalars, so we need to unwrap them. 
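+        For a single discrete dimension this returns a plain ``int``, for a single continuous dimension a
+        one-element ``float`` list, and otherwise a numpy array cast to the matching dtype.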
""" + a = self.taken_action.detach().numpy()[0] + if len(self.bounds) == 1: a = a[0] + if self.bounds.discrete and len(self.bounds) == 1: return int(a) + elif self.bounds.discrete and len(self.bounds) != 1: return a.astype(int) + elif not self.bounds.discrete and len(self.bounds) == 1: return [float(a)] + elif not self.bounds.discrete and len(self.bounds) != 1: return a.reshape(-1,).astype(float) + raise ValueError(f'This should not have crashed.') + + def set_single_action(self, action: np.array): + if np.isscalar(action): + self.taken_action = torch.tensor(data=action).reshape(1, -1) + elif len(action.shape) == 1: + self.taken_action = torch.tensor(data=action).reshape(1, -1) + elif len(action.shape) == 3 and action.shape[0] == 1: + self.taken_action = torch.tensor(data=action)[0] + self.taken_action = self.taken_action.int() if self.bounds.discrete else self.taken_action.float() + + +@dataclass +class State(object): + r""" + Handles states, both their main and alternate formats. + + s (np.array): State space acquired from `env.reset` `s_prime` or `env.render`. + + s_prime (np.array): State space acquired from `env.step` or `env.render`. + + alt_s (np.array): Alternate State space acquired from `env.reset` `s_prime` or `env.render`. Should be an image. + + alt_s_prime (np.array): Alternate State space acquired from `env.step` or `env.render`. Should be an image. + + mode (int): Should be either FEED_TYPE_IMAGE or FEED_TYPE_STATE + + observation_space (gym.Space): Used for estimating the max number of values. This is important for embeddings. + + bounds (Bounds): Maximum and minimum values for each state dimension. + + n_possible_values (int): An integer or inf value indicating the total number of possible actions there are. + """ + s: torch.tensor + s_prime: torch.tensor + alt_s: Union[torch.tensor, np.array] + alt_s_prime: Union[torch.tensor, np.array] + observation_space: gym.Space + mode: int = FEED_TYPE_STATE + bounds: Bounds = None + n_possible_values: int = 0 + + def __str__(self): + out = copy(self.__dict__) + for key in out: + if out[key] is not None and (key == 's' or list_in_str(key, ['_s', 's_'])): out[key] = out[key].shape + + return f'State: ' + ', '.join([str(i) for i in out.items()]) + + def _fix_field(self, input_field): + if input_field is None: return None + input_field = copy(input_field) + + if type(input_field) is str: input_field = np.array(input_field) + elif type(input_field) is tuple: + dtype = int if self.bounds.discrete else float + input_field = torch.tensor(data=np.array(input_field).reshape(1, -1).astype(dtype)) + elif np.isscalar(input_field): input_field = torch.tensor(data=input_field) + elif type(input_field) is torch.Tensor: input_field = input_field.clone().detach() + else: input_field = torch.from_numpy(input_field) + + # If a non-image state missing the batch dim + if len(input_field.shape) <= 1: return input_field.reshape(1, -1) + # If a non-image 2+d state missing the batch dim + elif input_field.shape[0] != 1 and len(input_field.shape) != 3: return input_field.reshape(1, -1) + # If an image state and missing the batch dim + elif input_field.shape[0] != 1 and len(input_field.shape) == 3: return input_field.unsqueeze(0) + # If an image with 4 dims (b, w, h, c), return safely + elif input_field.shape[0] == 1 and len(input_field.shape) == 4: return input_field + # If not an image, but has 2 dims (b, d), return safely + elif input_field.shape[0] == 1 and len(input_field.shape) == 2: return input_field + raise ValueError(f'Input has shape {input_field} 
for mode {self.mode}. This is unexpected.') + + def __post_init__(self): + if self.mode not in [FEED_TYPE_IMAGE, FEED_TYPE_STATE]: + ValueError(f'Mode invalid {self.mode} not valid feed type') + # We want to swap the state variables if the alt is the image state + if self.mode == FEED_TYPE_IMAGE and len(self.alt_s.shape) > 2 and len(self.s.shape) != 3: + self.observation_space = gym.spaces.Box(0, 255, self.alt_s.shape, dtype=np.int64) + self.alt_s, self.alt_s_prime, self.s, self.s_prime = self.s, self.s_prime, self.alt_s, self.alt_s_prime + # Determine bounds + self.bounds = Bounds(self.observation_space) + self.n_possible_values = self.bounds.n_possible_values + # Fix Shapes + self.s, self.s_prime = self._fix_field(self.s), self._fix_field(self.s_prime) + self.alt_s, self.alt_s_prime = self._fix_field(self.alt_s), self._fix_field(self.alt_s_prime) + + +@dataclass +class MDPStep(object): + r""" + Contains all the variables to represent a Markov Decision Process step. + + action (Action): + + state (State): + + done (bool): + + reward (float): + + episode (int): + + step (int): + + """ + action: Action + state: State + done: torch.tensor + reward: torch.tensor + episode: int + step: int + + def __post_init__(self): + self.action = deepcopy(self.action) + self.state = deepcopy(self.state) + self.reward = torch.tensor(data=self.reward).reshape(1, -1).float() + self.done = torch.tensor(data=self.done).reshape(1, -1).float() + + def __str__(self): return ', '.join([str(self.__dict__[el]) for el in self.__dict__]) + + def clean(self): + r""" Removes fields that are generally unimportant (purely debugging) """ + self.state.alt_s_prime = None + self.state.alt_s = None + self.state.observation_space = None + self.state.bounds = None + self.state.n_possible_values = None + self.action.raw_action = None + self.action.action_space = None + self.action.bounds = None + self.action.n_possible_values = None + + @property + def data(self): return self.state.s_prime[0], self.state.alt_s_prime[0] + @property + def obj(self): return self.__dict__ + @property + def s(self): return self.state.s + @property + def s_prime(self): return self.state.s_prime + @property + def alt_s_prime(self): return self.state.alt_s_prime + @property + def a(self): return self.action.taken_action + @property + def d(self): + return bool(self.done) + + +class MDPCallback(LearnerCallback): + def __init__(self, learn): + """ + Handles action assignment, episode naming. Args: learn: - mem_strategy: - k: - max_episodes: - episode: - iteration: """ super().__init__(learn) - self.mem_strategy = mem_strategy + self.train_ds: MDPDataset = learn.data.train_ds + self.valid_ds: MDPDataset = None if learn.data.empty_val else learn.data.valid_ds + + def on_batch_begin(self, last_input, last_target, train, **kwargs: Any): + """ Set the Action of a dataset, determine if still warming up. 
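+        Also returns ``{'skip_bwd': True}`` so fastai's usual backward pass is skipped; the agent performs its
+        own optimization from its model callbacks instead.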
""" + a = self.learn.predict(last_input) + if train: self.train_ds.action = Action(taken_action=a, action_space=self.train_ds.action.action_space) + else: self.valid_ds.action = Action(taken_action=a, action_space=self.train_ds.action.action_space) + self.train_ds.is_warming_up = self.learn.model.warming_up + if self.valid_ds is not None: self.valid_ds.is_warming_up = self.learn.model.warming_up + if not self.learn.model.warming_up and self.learn.loss_func is None: + self.learn.init_loss_func() + return {'skip_bwd': True} + + def on_backward_end(self, **kwargs: Any): + return {'skip_step': True} + + def on_step_end(self, **kwargs: Any): + return {'skip_zero': True} + + def on_epoch_end(self, last_metrics, epoch, **kwargs: Any) -> None: + """ Updates the most recent episode number in both datasets. """ + self.train_ds.x.set_recent_run_episode(epoch) + self.train_ds.episode = epoch + if last_metrics[0] is not None: + self.valid_ds.x.set_recent_run_episode(epoch) + self.valid_ds.episode = epoch + + +class MDPMemoryManager(LearnerCallback): + def __init__(self, learn, strategy, k=1): + super().__init__(learn) + self.strategy = strategy self.k = k - self.ds = None - self.max_episodes = max_episodes - self.episode = episode - self.last_episode = episode - self.iteration = iteration - self._persist = max_episodes is not None - - # Private Util Fields - self._train_episodes_keep = {} - self._valid_episodes_keep = {} - self._current_partition = 0 - - self._lower = 0 - self._upper = 0 - - def _comp_less(self, key, d, episode): return d[key] < self.data.x.info[episode] - def _comp_greater(self, key, d, episode): return d[key] > self.data.x.info[episode] - def _dict_access(self, key, dictionary): return {key: dictionary[key]} - - def _k_top_best(self, episode, episodes_keep): - best = dict(filter(partial(self._comp_greater, d=episodes_keep, episode=episode), episodes_keep)) - return list(dict(sorted(best.items(), reverse=True)).keys()) - - def _k_top_worst(self, episode, episodes_keep): - worst = dict(filter(partial(self._comp_less, d=episodes_keep, episode=episode), episodes_keep)) - return list(dict(sorted(worst.items())).keys()) - - def _k_top_both(self, episode, episodes_keep): - best, worst = self._k_top_best(episode, episodes_keep), self._k_top_worst(episode, episodes_keep) - if best: return best - return worst - - def _k_partitions_best(self, episode, episodes_keep): - # Filter episodes by their partition - if episode >= self._upper: self.lower, self._upper = self._upper, self._upper + self.max_episodes // self.k - filtered_episodes = {ep: episodes_keep[ep] for ep in episodes_keep if self._lower <= ep < self._upper} - best = list(filter(partial(self._comp_greater, d=filtered_episodes, episode=episode), filtered_episodes)) - best = {k: filtered_episodes[k] for k in filtered_episodes if k in best} - return list(dict(sorted(best.items(), reverse=True)).keys()) - - def _k_partitions_worst(self, episode, episodes_keep): - # Filter episodes by their partition - if episode >= self._upper: self.lower, self._upper = self._upper, self._upper + self.max_episodes // self.k - filtered_episodes = {ep: episodes_keep[ep] for ep in episodes_keep if self._lower <= ep < self._upper} - worst = list(filter(partial(self._comp_less, d=filtered_episodes, episode=episode), filtered_episodes)) - worst = {k: filtered_episodes[k] for k in filtered_episodes if k in worst} - return list(dict(sorted(worst.items())).keys()) - - def _k_partitions_both(self, episode, episodes_keep): - best, worst = 
self._k_partitions_best(episode, episodes_keep), self._k_partitions_worst(episode, episodes_keep) - if best: return best - return worst - - def on_loss_begin(self, **kwargs: Any): - self.iteration += 1 - - def manage_memory(self, episodes_keep, episode): - if episode not in self.ds.x.info: return - episodes_keep[episode] = self.ds.x.info[episode] - if len(episodes_keep) < self.k: return - # We operate of the second to most recent episode so the agent still has an opportunity to use the full episode - episode = episode - 1 - if episode not in self.ds.x.info: return + self._strategy_fn_dict = { + 'k_partitions_top': self.k_top + } - try: - # If the episodes to keep is full, then we have to decide which ones to remove - if self.mem_strategy == 'k_top_best': del_ep = self._k_top_best(episode, episodes_keep) - if self.mem_strategy == 'k_top_worst': del_ep = self._k_top_worst(episode, episodes_keep) - if self.mem_strategy == 'k_top_both': del_ep = self._k_top_both(episode, episodes_keep) - if self.mem_strategy == 'k_partitions_best': del_ep = self._k_partitions_best(episode, episodes_keep) - if self.mem_strategy == 'k_partitions_worst': del_ep = self.k_partitions_worst(episode, episodes_keep) - if self.mem_strategy == 'k_partitions_both': del_ep = self._k_partitions_both(episode, episodes_keep) - if self.mem_strategy == 'all': del_ep = [-1] - - # If there are episodes to delete, then set them as the main episode to delete - if len(del_ep) != 0: - # episodes_keep[episode] = self.ds.x.info[episode] - del episodes_keep[del_ep[0]] - episode = del_ep[0] - self.ds.x.clean(episode) - self.ds.x.info = episodes_keep - - except KeyError as e: - pass - except TypeError as e: - pass - - def on_train_begin(self, n_epochs, **kwargs: Any): - self.max_episodes = n_epochs if not self._persist else self.max_episodes - self._upper = self.max_episodes // self.k - - def on_epoch_begin(self, epoch, **kwargs: Any): - self.episode = epoch if not self._persist else self.episode + 1 - self.iteration = 0 - - def on_epoch_end(self, **kwargs: Any) -> None: - if self.learn.data.train_ds is not None: - self.ds = self.learn.data.train_ds - self.manage_memory(self._train_episodes_keep, self.episode) - if self.learn.data.valid_dl is not None: - self.ds = self.learn.data.valid_ds - self.manage_memory(self._valid_episodes_keep, self.episode) + def _comp_less(self, key, info, episode): + return info[key] < info[episode] + def _comp_greater(self, key, info, episode): + return info[key] > info[episode] -class MDPDataset(Dataset): - def __init__(self, env: gym.Env, feed_type=FEED_TYPE_STATE, render='rgb_array', max_steps=None, bs=8, - x=None, memory_management_strategy='k_partitions_best', k=1, skip=False, embeddable=False): - """ - Handles the running and loading of environments, as well as storing episode steps. 
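# Illustrative sketch of the 'k_partitions_top' strategy implemented by `k_top` below (a
# simplified, editorial reference of the intended behaviour, not part of this patch):
# `info` maps episode -> [cumulative_reward, already_cleaned]; episode -1 holds placeholder
# data and is cleaned first, otherwise only the k highest-reward episodes are kept.
from typing import Dict, List

def sketch_k_top(info: Dict[int, List], k: int = 1):
    if -1 in info and not info[-1][1]: return [-1]      # placeholder episode is cleaned first
    alive = [ep for ep in info if not info[ep][1]]      # episodes still holding step data
    if len(alive) <= k: return None                     # not enough episodes to prune yet
    keep = sorted((ep for ep in info if ep != -1), key=lambda ep: info[ep][0], reverse=True)[:k]
    return [ep for ep in alive if ep not in keep]       # everything else is returned for cleaning

# Episode 1 has the highest total reward, so with k=1 episodes 0 and 2 are returned for cleaning.
assert sorted(sketch_k_top({0: [10.0, False], 1: [25.0, False], 2: [5.0, False]})) == [0, 2]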
+ def k_top(self, info: Dict[int, List[Tuple[float, bool]]]): + # If the episode -1 is defined, then clean it, this is just placeholder data + if -1 in info and not info[-1][1]: return [-1] + # If the number of not-clean episodes is less than k, don't do anything + if len([k for k in info if not info[k][1]]) <= self.k: return None + # Collect k top episodes, then clean the rest + k_top = [] + for episode in [i for i in info if i != -1]: + # If the episode is greater than all of the other episodes that are not already in k_top + compared = [info[episode][0] > info[k][0] for k in info if k not in k_top and k != -1 and k != episode] + if all(compared) and len(compared) != 0 and len(k_top) < self.k: + k_top.append(episode) + + return list(set([k for k in info if not info[k][1]]) - set(k_top)) - mem_strategy has a few settings. This is for inference on the existing data: - - (k_partitions_best) keep partition best episodes - - (k_partitions_both) keep partition worst best episodes - - (k_top_best) keep high fidelity k top episodes - - (k_top_worst) keep k top worst - - (k_top_both) keep k top worst and best - - (non) keep non, only load into memory (always keep first) - - (all): keep all steps will be kept (most memory inefficient) + + def on_epoch_end(self, **kwargs: Any): + for ds_type in [DatasetType.Train] if self.learn.data.empty_val else [DatasetType.Train, DatasetType.Valid]: + ds: MDPDataset = self.learn.dl(ds_type).dataset + episodes = self._strategy_fn_dict[self.strategy](ds.x.info) + if episodes is not None: + for e in episodes: ds.x.clean(e) + + +class MDPDataset(Dataset): + def __init__(self, env: gym.Env, memory_manager, bs, render='rgb_array', feed_type=FEED_TYPE_STATE, max_steps=None): + r""" + Handles env execution and ItemList building. Args: - env: - feed_type: - render: - max_steps: - bs: - x: - memory_management_strategy: Reference above. Tests `self` how to keep memory usage down. - k: Related to the `mem_strategy` and is either k quartiles and k most. - save_every: Skip adding to datasets. + env: OpenAI environment to execute. + memory_manager: Handles how the list size will be reduced sch as removing image data. + bs: Size of a single batch for models and the dataset to use. 
""" - self.k = k - self.skip = False - if skip: - self.skip = skip - return - self.mem_strat = memory_management_strategy - self.bs = bs - # noinspection PyUnresolvedReferences,PyProtectedMember - env._max_episode_steps = env.spec.max_episode_steps if not hasattr(env, '_max_episode_steps') else env._max_episode_steps - self.max_steps = env._max_episode_steps if max_steps is None else max_steps + self.env = env self.render = render self.feed_type = feed_type - self.env = env - # MDP specific values - self.actions = self.get_random_action(env.action_space) - if isinstance(env.action_space, Box): self.raw_action = np.random.randn((env.action_space.shape[0])) - elif isinstance(env.action_space, Discrete): self.raw_action = np.random.randn((env.action_space.n)) - else: self.raw_action = self.get_random_action(env.action_space) - - self.is_done = True - self.current_state = None - self.current_image = None - - self.embeddable = embeddable - - self.env_specific_handle() - self.counter = -1 - self.episode = 0 - self.x = MarkovDecisionProcessList() if x is None else x # self.new(0) - self.item = None - self.episodes_to_keep = {} + self.bs = bs + self._max_steps = max_steps + self.action = Action(taken_action=self.env.action_space.sample(), action_space=self.env.action_space) + self.state = None + self.s_prime, self.alt_s_prime = None, None + self.callback = [MDPCallback, memory_manager] + # Tracking fields + self.episode = -1 + self.counter = 0 + self.is_warming_up = True + + # FastAI fields + self.x = MDPList([]) + self.item: Union[MDPStep, None] = None + self.new(None) + + def aug_steps(self, steps): + if self.is_warming_up and steps < self.bs: return self.bs + return steps @property - def state_size(self): - if self.feed_type == FEED_TYPE_STATE: - return self.env.observation_space - else: - return gym.spaces.Box(0, 255, shape=self.env.render('rgb_array').shape) - - def __del__(self): - self.env.close() + def max_steps(self): + if self._max_steps is not None: return self._max_steps + if hasattr(self.env, '_max_episode_steps'): return getattr(self.env, '_max_episode_steps') + if self.env.spec.max_episode_steps is not None: return self.env.spec.max_episode_steps - def env_specific_handle(self): - if isinstance(self.env, TimeLimit) and isinstance(self.env.unwrapped, AlgorithmicEnv): - self.render = 'ansi' if self.render == 'rgb_array' else self.render - if isinstance(self.env, TimeLimit) and isinstance(self.env.unwrapped, discrete.DiscreteEnv): - self.render = 'ansi' if self.render == 'rgb_array' else self.render + msg = f'Env {self.env.spec.id} does not have max episode steps. ' + msg += ' Either pass the max steps as a param, or catch this exception then pass a default max step param.' + raise MaxEpisodeStepsMissingError(msg) - def get_random_action(self, action_space=None): - action_space = action_space if action_space is not None else self.env.action_space - return action_space.sample() - - def _get_image(self): - # Specifically for the stupid blackjack-v0 env >:( + @property + def image(self): + r""" Needed because of blackjack-v0 env >:( """ try: current_image = self.env.render('rgb_array') if self.render == 'human': self.env.render(self.render) @@ -250,66 +412,61 @@ def _get_image(self): current_image = None return current_image - def new(self, _): + def __del__(self): + self.env.close() + + def __len__(self): + return self.aug_steps(self.max_steps) + + def stage_1_env_reset(self): + r""" + Handles environment resetting and dataset batch termination. 
+ + We are interested in the entire dataset ending when an item is done. + + Returns: The state space and the image after a reset. + """ - New element is a query of the environment + if self.counter != 0 and self.item.d: + self.counter = 0 + if not self.is_warming_up: raise StopIteration + if self.item is None or self.item.d: return self.env.reset(), self.image + return self.s_prime, self.alt_s_prime - Args: - _: + def stage_2_env_step(self) -> Tuple[np.array, float, bool, None, np.array]: + r""" + Handles taking a step in the environment. - Returns: + We want to cancel the env early if we are at our max step amount. + Returns: The state, reward, whether the episode is done, and the image. """ - # First Phase: decide on episode reset. Collect current state and image representations. - if self.is_done or self.counter >= self.max_steps - 3: - self.current_state, reward, self.is_done, info = self.env.reset(), 0, False, {} - if type(self.current_state) is not list and type(self.current_state) is not np.ndarray: self.current_state = [self.current_state] - # Specifically for the stupid blackjack-v0 env >:( - self.current_image = self._get_image() - - result_state, reward, self.is_done, info = self.env.step(self.actions) - if self.is_done and self.counter == -1: - self.current_state, reward, self.is_done, info = self.env.reset(), 0, False, {} - - if type(result_state) is not list and type(result_state) is not np.ndarray: result_state = [result_state] - result_image = self._get_image() - self.counter += 1 + s_prime, reward, done, _ = self.env.step(self.action.get_single_action()) + # If we are at the max steps limit but the env is not done, we need to force an env end. + # However, we need the loop to iterate +1 time for allowing the stage_1_env_reset + if len(self) - 2 == self.counter: done = True + return s_prime, reward, done, _, self.image - # Second Phase: Generate MDP slice - result_state = result_image.transpose(2, 0, 1) if self.feed_type == FEED_TYPE_IMAGE and result_image is not None else result_state - current_state = self.current_image.transpose(2, 0, 1) if self.feed_type == FEED_TYPE_IMAGE and self.current_image is not None else self.current_state - alternate_state = result_state if self.feed_type == FEED_TYPE_IMAGE or result_state is None else result_image - items = MarkovDecisionProcessSlice(state=np.copy(current_state), state_prime=np.copy(result_state), - alt_state=np.copy(alternate_state), action=np.copy(self.actions), - reward=reward, done=copy(self.is_done), feed_type=copy(self.feed_type), - episode=copy(self.episode), raw_action=self.raw_action) - self.current_state = copy(result_state) - self.current_image = copy(result_image) + def new(self, _): + s, alt_s = self.stage_1_env_reset() + self.s_prime, reward, done, _, self.alt_s_prime = self.stage_2_env_step() + # If both the current item and the done are both true, then we need to retry the env + if self.item is not None and self.item.d and done: return self.new() - list_item = MarkovDecisionProcessList([items]) - return list_item + self.state = State(s, self.s_prime, alt_s, self.alt_s_prime, self.env.observation_space, self.feed_type) + self.item = MDPStep(self.action, self.state, done, reward, self.episode, self.counter) + self.counter += 1 - def __len__(self): - return self.max_steps + return MDPList([self.item]) - def __getitem__(self, _) -> 'MDPDataset': + def __getitem__(self, _): item = self.new(_) - if (self.x and self.is_done and self.counter != -1) or \ - (self.counter >= self.max_steps - 2): - self.x.add(item) - 
self.counter = -1 - self.episode += 1 - raise StopIteration - self.x.add(item) - # x = self.x[idxs] # Perhaps have this as an option? - x = self.x[-1] - return x.copy() if isinstance(x, np.ndarray) else x + return self.x[-1] def to_csv(self, root_path, name): - df = self.x.to_df() # type: pd.DateFrame() if not os.path.exists(root_path): os.makedirs(root_path) - df.to_csv(root_path / (name + '.csv'), index=False) + self.x.to_df().to_csv(root_path / (name + '.csv'), index=False) def to_pickle(self, root_path, name): if not os.path.exists(root_path): os.makedirs(root_path) @@ -318,149 +475,71 @@ def to_pickle(self, root_path, name): class MDPDataBunch(DataBunch): - def _get_sizes_and_possible_values(self, item): - if isinstance(item, Discrete) and len(item.shape) != 0: return item.n, item.n - if isinstance(item, Discrete) and len(item.shape) == 0: return 1, item.n - if isinstance(item, Box) and (item.dtype == int or item.dtype == np.uint8): - return item.shape if len(item.shape) > 1 else item.shape[0], np.prod(item.high) - if isinstance(item, Box) and item.dtype == np.float32: - return item.shape if len(item.shape) > 1 else item.shape[0], np.inf - - # noinspection PyUnresolvedReferences - def get_action_state_size(self): - if self.train_ds is not None: - a_s, s_s = self.train_ds.env.action_space, self.train_ds.state_size - elif self.valid_ds is not None: - a_s, s_s = self.valid_ds.env.action_space, self.valid_ds.state_size - else: - return None - return tuple(map(self._get_sizes_and_possible_values, [a_s, s_s])) - - @classmethod - def from_env(cls, env_name='CartPole-v1', max_steps=None, render='rgb_array', test_ds: Optional[Dataset] = None, - path: PathOrStr = None, bs: int = 1, feed_type=FEED_TYPE_STATE, val_bs: int = None, - num_workers: int = 0, embed=False, memory_management_strategy='k_partitions_both', - dl_tfms: Optional[Collection[Callable]] = None, device: torch.device = None, - collate_fn: Callable = data_collate, no_check: bool = False, add_valid=True, **dl_kwargs): - """ - - Args: - env_name: - max_steps: - render: - test_ds: - path: - bs: - feed_type: - val_bs: - num_workers: - embed: - memory_management_strategy: has a few settings. This is for inference on the existing data. 
- - (k_partitions_best) keep partition best episodes - - (k_partitions_both) keep partition worst best episodes - - (k_top_best) keep high fidelity k top episodes - - (k_top_worst) keep k top worst - - (k_top_both) keep k top worst and best - - (non) keep non, only load into memory (always keep first) - - (all): keep all steps will be kept (most memory inefficient) - dl_tfms: - device: - collate_fn: - no_check: - add_valid: - **dl_kwargs: - - Returns: + def __del__(self): + if self.train_dl is not None: del self.train_dl.train_ds + if self.valid_dl is not None: del self.valid_dl.valid_ds - """ + @property + def state_action_sample(self) -> Union[Tuple[State, Action], None]: + ds = ifnone(self.train_ds, self.valid_ds) # type: MDPDataset + return ds.state, ds.action if ds is not None else None - try: - # train_list = MDPDataset(gym.make(env_name), max_steps=max_steps, render=render) - # valid_list = MDPDataset(gym.make(env_name), max_steps=max_steps, render=render) - env = gym.make(env_name) - val_bs = bs if val_bs is None else val_bs - train_list = MDPDataset(env, max_steps=max_steps, render=render, bs=bs, embeddable=embed, - memory_management_strategy=memory_management_strategy) - if add_valid: valid_list = MDPDataset(env, max_steps=max_steps, render=render, bs=val_bs, embeddable=embed, - memory_management_strategy=memory_management_strategy) - else: valid_list = None - except error.DependencyNotInstalled as e: - print('Mujoco is not installed. Returning None') - if e.args[0].lower().__contains__('mujoco'): return None - - bs, val_bs = 1, None + @classmethod + def from_env(cls, env_name='CartPole-v1', max_steps=None, render='rgb_array', bs: int = 64, + feed_type=FEED_TYPE_STATE, num_workers: int = 0, memory_management_strategy='k_partitions_top', + split_env_init=True, device: torch.device = None, no_check: bool = False, + add_valid=True, **dl_kwargs): + + env = gym.make(env_name) + memory_manager = partial(MDPMemoryManager, strategy=memory_management_strategy) + train_list = MDPDataset(env, max_steps=max_steps, feed_type=feed_type, render=render, bs=bs, + memory_manager=memory_manager) + if add_valid: + valid_list = MDPDataset(env if split_env_init else gym.make(env_name), max_steps=max_steps, + render=render, bs=bs, feed_type=feed_type, memory_manager=memory_manager) + else: + valid_list = None path = './data/' + env_name.split('-v')[0].lower() + datetime.now().strftime('%Y%m%d%H%M%S') - - return cls.create(train_list, valid_list, num_workers=num_workers, test_ds=test_ds, path=path, bs=bs, - feed_type=feed_type, val_bs=val_bs, dl_tfms=dl_tfms, device=device, collate_fn=collate_fn, - no_check=no_check, **dl_kwargs) + return cls.create(train_list, valid_list, num_workers=num_workers, bs=1, device=device, **dl_kwargs) @classmethod - def from_pickle(cls, env_name='CartPole-v1', max_steps=None, render='rgb_array', test_ds: Optional[Dataset] = None, - path: PathOrStr = None, bs: int = 1, feed_type=FEED_TYPE_STATE, val_bs: int = None, - num_workers: int = 0, - dl_tfms: Optional[Collection[Callable]] = None, device: torch.device = None, - collate_fn: Callable = data_collate, no_check: bool = False, add_valid=True, **dl_kwargs): + def from_pickle(cls, env_name='CartPole-v1', bs: int = 1, feed_type=FEED_TYPE_STATE, render='rgb_array', + max_steps=None, add_valid=True, num_workers: int = defaults.cpus, path: PathOrStr = None, + device: torch.device = None, **dl_kwargs): if path is None: path = [_ for _ in os.listdir('./data/') if _.__contains__(env_name.split('-v')[0].lower())] if not path: raise 
IOError(f'There is no pickle dirs file found in ./data/ with the env name {env_name}') path = Path('./data/' + path[0]) - try: - env = gym.make(env_name) - val_bs = bs if val_bs is None else val_bs - train_ls = pickle.load(open(path / 'train.pickle', 'rb')) - train_list = MDPDataset(env, max_steps=max_steps, render=render, bs=bs, x=train_ls) - - if add_valid: - valid_ls = pickle.load(open(path / 'valid.pickle', 'rb')) - valid_list = MDPDataset(env, max_steps=max_steps, render=render, bs=val_bs, x=valid_ls) - else: - valid_list = None - - except error.DependencyNotInstalled as e: - print('Mujoco is not installed. Returning None') - if e.args[0].lower().__contains__('mujoco'): return None - - bs, val_bs = 1, None - if path is None: path = './data/' + env_name.split('-v')[0].lower() + datetime.now().strftime('%Y%m%d%H%M%S') + env = gym.make(env_name) + train_ls = pickle.load(open(path / 'train.pickle', 'rb')) + train_list = MDPDataset(env, max_steps=max_steps, render=render, bs=bs, x=train_ls) - return cls.create(train_list, valid_list, num_workers=num_workers, test_ds=test_ds, path=path, bs=bs, - feed_type=feed_type, val_bs=val_bs, dl_tfms=dl_tfms, device=device, collate_fn=collate_fn, - no_check=no_check, **dl_kwargs) + if add_valid: + valid_ls = pickle.load(open(path / 'valid.pickle', 'rb')) + valid_list = MDPDataset(env, max_steps=max_steps, render=render, bs=bs, x=valid_ls) + else: + valid_list = None - @classmethod - def from_csv(cls, env_name='CartPole-v1', max_steps=None, render='rgb_array', test_ds: Optional[Dataset] = None, - path: PathOrStr = None, bs: int = 1, feed_type=FEED_TYPE_STATE, val_bs: int = None, - num_workers: int = 0, - dl_tfms: Optional[Collection[Callable]] = None, device: torch.device = None, - collate_fn: Callable = data_collate, no_check: bool = False, add_valid=True, **dl_kwargs): - raise NotImplementedError('Not implemented for now. Saving state data into a csv seems extremely clunky.' - ' Suggested to use to_pickle and from_pickle due to easier numpy conversion.') + if path is None: path = './data/' + env_name.split('-v')[0].lower() + datetime.now().strftime('%Y%m%d%H%M%S') + + return cls.create(train_list, valid_list, num_workers=num_workers, path=path, bs=bs, feed_type=feed_type, + val_bs=1, device=device, **dl_kwargs) @classmethod - def create(cls, train_ds: MDPDataset, valid_ds: MDPDataset = None, - test_ds: Optional[Dataset] = None, path: PathOrStr = '.', bs: int = 1, - feed_type=FEED_TYPE_STATE, - val_bs: int = None, num_workers: int = defaults.cpus, dl_tfms: Optional[Collection[Callable]] = None, - device: torch.device = None, collate_fn: Callable = data_collate, no_check: bool = False, - **dl_kwargs) -> 'DataBunch': + def create(cls, train_ds: MDPDataset, valid_ds: MDPDataset = None, bs: int = 1, + num_workers: int = defaults.cpus, device: torch.device = None, **dl_kwargs) -> 'DataBunch': """Create a `DataBunch` from `train_ds`, `valid_ds` and maybe `test_ds` with a batch size of `bs`. Passes `**dl_kwargs` to `DataLoader()` Since this is a MarkovProcess, the batches need to be `bs=1` (for now...) 
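
        A minimal usage sketch (assumes a registered gym env such as CartPole-v0 and the
        default 'k_partitions_top' memory strategy):

            import gym
            from functools import partial
            mem = partial(MDPMemoryManager, strategy='k_partitions_top')
            train_ds = MDPDataset(gym.make('CartPole-v0'), memory_manager=mem, bs=64)
            data = MDPDataBunch.create(train_ds, num_workers=0)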
""" - train_ds.feed_type = feed_type - if valid_ds is not None: valid_ds.feed_type = feed_type - if test_ds is not None: test_ds.feed_type = feed_type - - datasets = cls._init_ds(train_ds, valid_ds, test_ds) - val_bs = ifnone(val_bs, bs) + datasets = cls._init_ds(train_ds, valid_ds, None) dls = [DataLoader(d, b, shuffle=s, drop_last=s, num_workers=num_workers, **dl_kwargs) for d, b, s in - zip(datasets, (bs, val_bs, val_bs, val_bs), (False, False, False, False)) if d is not None] - databunch = cls(*dls, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check) + zip(datasets, (bs, bs, bs, bs), (False, False, False, False)) if d is not None] + databunch = cls(*dls, **dl_kwargs) if valid_ds is None: databunch.valid_dl = None return databunch @@ -474,15 +553,13 @@ def to_pickle(self): @staticmethod def _init_ds(train_ds: Dataset, valid_ds: Dataset, test_ds: Optional[Dataset] = None): - # train_ds, but without training tfms - fix_ds = copy(train_ds) # valid_ds.new(train_ds.x) if hasattr(valid_ds, 'new') else train_ds - return [o for o in (train_ds, valid_ds, fix_ds, test_ds) if o is not None] + return [o for o in (train_ds, valid_ds, copy(train_ds), test_ds) if o is not None] -class MarkovDecisionProcessList(ItemList): +class MDPList(ItemList): _bunch = MDPDataBunch - def __init__(self, items=np.array([]), feed_type=FEED_TYPE_IMAGE, **kwargs): + def __init__(self, items: Iterator, **kwargs): """ Represents a MDP sequence over episodes. @@ -490,7 +567,7 @@ def __init__(self, items=np.array([]), feed_type=FEED_TYPE_IMAGE, **kwargs): Notes: Two important fields for you to be aware of: `items` and `x`. `x` is just the values being used for directly being feed into the model. - `items` contains an ndarray of MarkovDecisionProcessSlice instances. These contain the the primary values + `items` contains an ndarray of MarkovDecisionProcessSliceAlpha instances. These contain the the primary values in x, but also the other important properties of a MDP. 
Args: @@ -498,73 +575,32 @@ def __init__(self, items=np.array([]), feed_type=FEED_TYPE_IMAGE, **kwargs): feed_type: **kwargs: """ - super(MarkovDecisionProcessList, self).__init__(items, **kwargs) - self.feed_type = feed_type - self.copy_new.append('feed_type') - self.ignore_empty = True + super().__init__(items, **kwargs) self.info = {} + def set_recent_run_episode(self, episode): + for i, item in enumerate(reversed(self.items)): + if item.d and i != 0: break + item.episode = episode + self._update_info(episode, item) + def clean(self, episode): - for item in self.items: - if item.episode == episode: item.clean() + self.info[episode][1] = True + [item.clean() for item in self.items if item.episode == episode] - def add(self, items: 'ItemList'): - # Update the episode related composition information - for item in items.items: - if item.episode in self.info: self.info[item.episode] = float(np.sum(self.info[item.episode] + item.reward)) - else: self.info[item.episode] = float(item.reward) + def _update_info(self, ep, item: MDPStep): + self.info[ep] = float(np.sum(self.info[ep][0] + float(item.reward))) if ep in self.info else float(item.reward) + self.info[ep] = [self.info[ep], False] + def add(self, items: 'ItemList'): + [self._update_info(item.episode, item) for item in items.items] super().add(items) - def to_df(self): - return pd.DataFrame([i.obj for i in self.items]) + def to_df(self): return pd.DataFrame([i.obj for i in self.items]) - def to_dict(self): - return [i.obj for i in self.items] + def to_dict(self): return [i.obj for i in self.items] - def get(self, i): - return self.items[i].data + def get(self, i): return self.items[i].data def reconstruct(self, t: Tensor, x: Tensor = None): - if self.feed_type == FEED_TYPE_IMAGE: - return MarkovDecisionProcessSlice(state=Image(t), state_prime=Image(x[0]), - alt_state=Floats(x[1]), action=Floats(x[1]), - reward=Floats(x[2]), done=x[3], feed_type=self.feed_type) - else: - return MarkovDecisionProcessSlice(state=Floats(t), state_prime=Floats(x[0]), - alt_state=Image(x[1]), action=Floats(x[1]), - reward=Floats(x[2]), done=x[3], feed_type=self.feed_type) - - -class MarkovDecisionProcessSlice(ItemBase): - # noinspection PyMissingConstructor - def __init__(self, state, state_prime, alt_state, action, reward, done, episode, raw_action, - feed_type=FEED_TYPE_IMAGE): - action = np.copy(action) - raw_action = np.copy(raw_action) - if len(action.shape) == 0: action = np.array(action, ndmin=1) - if isinstance(np.copy(action), int): action = np.array(action, ndmin=1) - if isinstance(reward, float) or isinstance(reward, int): reward = np.array(reward, ndmin=1) - self.current_state, self.result_state, self.alternate_state, self.actions, self.reward, self.done, self.episode, self.raw_action = state, state_prime, alt_state, action, reward, done, episode, raw_action - self.data, self.obj = alt_state if feed_type == FEED_TYPE_IMAGE else state, \ - {'state': self.current_state, 'state_prime': self.result_state, - 'alt_state': self.alternate_state, 'action': action, 'reward': reward, 'done': done, - 'episode': episode, 'feed_type': feed_type, 'raw_action': raw_action} - - def clean(self, only_alt=False): - if not only_alt: - self.current_state, self.result_state = None, None - self.obj['state'], self.obj['state_prime'] = None, None - - self.alternate_state, self.obj['alt_state'] = None, None - - def __str__(self): - formatted = ( - map(lambda y: f'{y}:{self.obj[y].shape}', filter(lambda y: y.__contains__('state'), self.obj.keys())), - map(lambda y: 
f'{y}:{self.obj[y]}', filter(lambda y: not y.__contains__('state'), self.obj.keys())) - ) - - return ', '.join(list(formatted[0]) + list(formatted[1])) - - def to_one(self): - return Image(self.alternate_state) + raise NotImplementedError('Not sure when this will be important.') diff --git a/fast_rl/core/agent_core.py b/fast_rl/core/agent_core.py index 4836eb0..cd9ffe2 100644 --- a/fast_rl/core/agent_core.py +++ b/fast_rl/core/agent_core.py @@ -4,17 +4,12 @@ from collections import deque from functools import partial from math import ceil -from typing import List, Optional +from typing import List import gym import numpy as np import torch from fastai.basic_train import * -from fastai.basic_train import BasicLearner, CallbackList, OptMetrics, master_bar, is_listy, first_el, to_np -from fastai.callback import CallbackHandler -from fastprogress import progress_bar - -from fast_rl.core.MarkovDecisionProcess import MarkovDecisionProcessSlice from fast_rl.core.data_structures import SumTree @@ -58,7 +53,7 @@ def __init__(self, epsilon_start, epsilon_end, decay, start_episode=0, end_episo def perturb(self, action, action_space: gym.Space): """ - TODO for now does random discrete selection. Move to continuous soon. + TODO for now does random discrete selection. Move to discrete soon. Args: action: @@ -127,8 +122,8 @@ def refresh(self, **kwargs): class ExperienceReplay(Experience): def __init__(self, memory_size, **kwargs): - """ - Basic store-er of state space transitions for training agents. + r""" + Basic store-er of s space transitions for training agents. References: [1] Mnih, Volodymyr, et al. "Playing atari with deep reinforcement learning." @@ -139,7 +134,7 @@ def __init__(self, memory_size, **kwargs): """ super().__init__(memory_size, **kwargs) self.max_size = memory_size - self.memory = deque(maxlen=memory_size) # type: List[MarkovDecisionProcessSlice] + self.memory = deque(maxlen=memory_size) # type: List[MarkovDecisionProcessSliceAlpha] def __len__(self): return len(self.memory) @@ -149,7 +144,7 @@ def sample(self, batch, **kwargs): return random.sample(self.memory, batch) def update(self, item, **kwargs): - if self.reduce_ram: item.clean(True) + if self.reduce_ram: item.clean() self.memory.append(copy.deepcopy(item)) @@ -218,7 +213,7 @@ def update(self, item, **kwargs): """ maximal_priority = self.alpha - if self.reduce_ram: item.clean(True) + if self.reduce_ram: item.clean() self.memory.add(np.abs(maximal_priority) + self.epsilon, item) @@ -234,91 +229,3 @@ def __init__(self, memory_size): memory_size: """ super().__init__(memory_size) - - -def loss_batch(model, cb_handler: Optional[CallbackHandler]): - """ TODO will be different. Is there anything extra needed from here? """ - if model.out is not None: cb_handler.on_loss_begin(model.out) - if model.loss is None: return None - cb_handler.on_backward_begin(model.loss) - cb_handler.on_backward_end() - cb_handler.on_step_end() - return model.loss.detach().cpu() - - -def validate(learn, dl, cb_handler: Optional[CallbackHandler] = None, - pbar=None, average=True, n_batch: Optional[int] = None): - learn.model.eval() - with torch.no_grad(): - val_losses, nums = [], [] - if cb_handler: cb_handler.set_dl(dl) - # TODO 1st change: in fit function, original uses xb, yb. Maybe figure out what 2nd value to include? 
- for element in progress_bar(dl, parent=pbar): - learn.data.valid_ds.actions = learn.predict(element) - if cb_handler: element = cb_handler.on_batch_begin(element, learn.data.valid_ds.actions, train=False) - val_loss = loss_batch(learn.model, cb_handler=cb_handler) - if val_loss is None: continue - val_losses.append(val_loss) - r = dl.actions if is_listy(dl.actions) else np.array([dl.actions]) - nums.append(first_el(r).shape[0]) - if cb_handler and cb_handler.on_batch_end(val_losses[-1]): break - if n_batch and (len(nums) >= n_batch): break - nums = np.array(nums, dtype=np.float32) - if average and val_losses: return (to_np(torch.stack(val_losses)) * nums).sum() / nums.sum() - else: return val_losses - - -def fit(epochs: int, learn: BasicLearner, callbacks: Optional[CallbackList] = None, metrics: OptMetrics = None): - """ - Takes a RL Learner and trains it on a given environment. - - Important behavior notes: - - - The original Fastai fit function outputs the loss for every epoch. Since many RL models need to fill a memory buffer before optimization, the fit function for epoch 0 will run multiple episodes until the memory is filled. - - **So when you see the agent running through episodes without output, please note that it is most likely filling the memory first before properly printing.** - - Args: - epochs: - learn: - callbacks: - metrics: - - Returns: - - """ - assert len(learn.data.train_dl) != 0, f"""Your training dataloader is empty, can't train a model. Use a smaller - batch size (batch size={learn.data.train_dl.batch_size} for {len(learn.data.train_dl.dataset)} elements).""" - # Since CallbackHandler is a dataclass, these input fields will be automatically populated via - # a default __init__ - cb_handler = CallbackHandler(callbacks, metrics) - cb_handler.state_dict['skip_validate'] = learn.data.empty_val - pbar = master_bar(range(epochs)) - cb_handler.on_train_begin(epochs, pbar=pbar, metrics=metrics) - # Note that metrics in the on_train begin method need a name field. - exception = False - loss = None - try: - for epoch in pbar: - learn.model.train() - cb_handler.set_dl(learn.data.train_dl) - cb_handler.on_epoch_begin() - # TODO 1st change: While the loss is None, the model's memory has not filled and / or is not ready. - while loss is None: - # TODO 2nd change: in fit function, original uses xb, yb. Maybe figure out what 2nd value to include? - for element in progress_bar(learn.data.train_dl, parent=pbar): - # TODO 3rd change: get the action for the given state. Move to on batch begin callback? - learn.data.train_ds.actions = learn.predict(element) - cb_handler.on_batch_begin(element, learn.data.train_ds.actions) - # TODO 4th change: loss_batch is way simpler... What is a batch to be defined as? 
- loss = loss_batch(learn.model, cb_handler) - if cb_handler.on_batch_end(loss): break - - loss = None - if not cb_handler.skip_validate and not learn.data.empty_val: - val_loss = validate(learn, learn.data.valid_dl, cb_handler=cb_handler, pbar=pbar) - else: val_loss = None - if cb_handler.on_epoch_end(val_loss): break - except Exception as e: - exception = e - raise - finally: cb_handler.on_train_end(exception) diff --git a/fast_rl/core/basic_train.py b/fast_rl/core/basic_train.py new file mode 100644 index 0000000..1a43d8c --- /dev/null +++ b/fast_rl/core/basic_train.py @@ -0,0 +1,31 @@ +from fastai.basic_train import Learner + + +class WrapperLossFunc(object): + def __init__(self, learn): + self.learn = learn + + def __call__(self, *args, **kwargs): + return self.learn.model.loss + + +class AgentLearner(Learner): + + def __post_init__(self) -> None: + super().__post_init__() + self._loss_func = WrapperLossFunc(self) + self.loss_func = None + self.callback_fns += self.model.learner_callbacks + self.data.train_ds.callback + + def predict(self, element, **kwargs): + return self.model.pick_action(element) + + def init_loss_func(self): + r""" + Initializes the loss function wrapper for logging loss. + + Since most RL models have a period of warming up such as filling memory buffers, we cannot log any loss. + By default, the learner will have a `None` loss function, and so the fit function will not try to log that + loss. + """ + self.loss_func = self._loss_func diff --git a/fast_rl/tests/test_DataBunch.py b/fast_rl/tests/test_DataBunch.py index d80bb26..09cd915 100644 --- a/fast_rl/tests/test_DataBunch.py +++ b/fast_rl/tests/test_DataBunch.py @@ -3,23 +3,23 @@ from fast_rl.util.file_handlers import get_absolute_path -def test_ImageDataBunch_init(): - """ - For understanding various databunches. - - For example, ImageDataBunch in the from folder: - - Src is originally an ImageList, but the following code: - - `src = src.label_from_folder(classes=classes)` - - CHANGES THE CLASS TO A LABELLISTS?!?!? - - In other words, the ImageList is capable of turning into a dataset. - - :return: - """ - data = ImageDataBunch.from_folder(get_absolute_path('data'), valid_pct=0.5) - - for e in data.train_ds: - print(e) \ No newline at end of file +# def test_ImageDataBunch_init(): +# """ +# For understanding various databunches. +# +# For example, ImageDataBunch in the from folder: +# +# Src is originally an ImageList, but the following code: +# +# `src = src.label_from_folder(classes=classes)` +# +# CHANGES THE CLASS TO A LABELLISTS?!?!? +# +# In other words, the ImageList is capable of turning into a dataset. +# +# :return: +# """ +# data = ImageDataBunch.from_folder(get_absolute_path('data'), valid_pct=0.5) +# +# for e in data.train_ds: +# print(e) \ No newline at end of file diff --git a/fast_rl/tests/test_Envs.py b/fast_rl/tests/test_Envs.py index 7e65d2b..588ca46 100644 --- a/fast_rl/tests/test_Envs.py +++ b/fast_rl/tests/test_Envs.py @@ -2,47 +2,47 @@ import numpy as np import pytest from fast_rl.core.Envs import Envs -from fast_rl.core.MarkovDecisionProcess import MDPDataBunch +# from fast_rl.core.MarkovDecisionProcess import MDPDataBunchAlpha -def test_individual_env(): - msg = 'the datasets in the dataloader seem to be different from the data bunches datasets...' +# def test_individual_env(): +# msg = 'the datasets in the dataloader seem to be different from the data bunches datasets...' 
- max_steps = 50 +# max_steps = 50 - env = 'CarRacing-v0' - print(f'Testing {env}') - mdp_databunch = MDPDataBunch.from_env(env, max_steps=max_steps, num_workers=0) - epochs = 1 +# env = 'CarRacing-v0' +# print(f'Testing {env}') +# mdp_databunch = MDPDataBunchAlpha.from_env(env, max_steps=max_steps, num_workers=0) +# epochs = 1 - assert max_steps == len(mdp_databunch.train_dl) - assert max_steps == len(mdp_databunch.valid_dl) +# assert max_steps == len(mdp_databunch.train_dl) +# assert max_steps == len(mdp_databunch.valid_dl) - for epoch in range(epochs): - for _ in mdp_databunch.train_dl: - mdp_databunch.train_ds.actions = mdp_databunch.train_ds.get_random_action() - # print(f'state {element.shape} action {mdp_databunch.train_dl.dl.dataset.actions}') - assert np.sum( - np.equal(mdp_databunch.train_dl.dl.dataset.actions, mdp_databunch.train_ds.actions)) == np.size( - mdp_databunch.train_ds.actions), msg +# for epoch in range(epochs): +# for _ in mdp_databunch.train_dl: +# mdp_databunch.train_ds.actions = mdp_databunch.train_ds.get_random_action() +# # print(f's {element.shape} action {mdp_databunch.train_dl.dl.dataset.actions}') +# assert np.sum( +# np.equal(mdp_databunch.train_dl.dl.dataset.actions, mdp_databunch.train_ds.actions)) == np.size( +# mdp_databunch.train_ds.actions), msg - for _ in mdp_databunch.valid_dl: - mdp_databunch.valid_ds.actions = mdp_databunch.valid_ds.get_random_action() - # print(f'state {element.shape} action {mdp_databunch.valid_dl.dl.dataset.actions}') - assert np.sum( - np.equal(mdp_databunch.train_dl.dl.dataset.actions, mdp_databunch.train_ds.actions)) == np.size( - mdp_databunch.train_ds.actions), msg +# for _ in mdp_databunch.valid_dl: +# mdp_databunch.valid_ds.actions = mdp_databunch.valid_ds.get_random_action() +# # print(f's {element.shape} action {mdp_databunch.valid_dl.dl.dataset.actions}') +# assert np.sum( +# np.equal(mdp_databunch.train_dl.dl.dataset.actions, mdp_databunch.train_ds.actions)) == np.size( +# mdp_databunch.train_ds.actions), msg -def test_individual_env_no_dl(): - """Just a nice place to do sanity testing on new / untested envs.""" - env = gym.make('maze-random-10x10-plus-v0') - for episode in range(2): - done = False - env.reset() - while not done: - output = env.step(env.action_space.sample()) - done = output[2] - env.render('human') +# def test_individual_env_no_dl(): +# """Just a nice place to do sanity testing on new / untested envs.""" +# env = gym.make('maze-random-10x10-plus-v0') +# for episode in range(2): +# done = False +# env.reset() +# while not done: +# output = env.step(env.action_space.sample()) +# done = output[2] +# env.render('human') diff --git a/fast_rl/tests/test_Interpretation.py b/fast_rl/tests/test_Interpretation.py index 8412722..139597f 100644 --- a/fast_rl/tests/test_Interpretation.py +++ b/fast_rl/tests/test_Interpretation.py @@ -1,52 +1,2 @@ -from typing import Collection -from fastai.basic_data import DatasetType -from fast_rl.agents.DDPG import DDPG -from fast_rl.agents.DQN import DQN, FixedTargetDQN -from fast_rl.core.Interpreter import AgentInterpretationAlpha -from fast_rl.core.Learner import AgentLearner -from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, FEED_TYPE_IMAGE, FEED_TYPE_STATE - - -# def test_interpretation_heatmap(): -# data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100, -# feed_type=FEED_TYPE_STATE, add_valid=False, memory_management_strategy='non') -# model = DQN(data, batch_size=8) -# learn = AgentLearner(data, model) -# -# learn.fit(5) -# interp 
= AgentInterpretationAlpha(learn, ds_type=DatasetType.Train) -# interp.plot_heatmapped_episode(-1, action_index=0) - - -def test_interpretation_plot_q_dqn_returns(): - data = MDPDataBunch.from_env('maze-random-5x5-v0', max_steps=100, render='human', add_valid=False, - memory_management_strategy='non') - model = DQN(data) - learn = AgentLearner(data, model) - learn.fit(5) - interp = AgentInterpretationAlpha(learn, ds_type=DatasetType.Train) - interp.plot_heatmapped_episode(2) - - -# def test_inerpretation_plot_model_accuracy_fixeddqn(): -# data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100, add_valid=False, -# memory_management_strategy='non') -# model = FixedTargetDQN(data, batch_size=64, max_episodes=100, copy_over_frequency=4) -# learn = AgentLearner(data, model) -# -# learn.fit(5) -# interp = AgentInterpretationAlpha(learn, ds_type=DatasetType.Train) -# interp.plot_agent_accuracy_density() - - -# def test_interpretation_plot_q_density(): -# data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100, add_valid=False, -# memory_management_strategy='non') -# model = FixedTargetDQN(data, batch_size=128, max_episodes=100, copy_over_frequency=3, use_embeddings=True) -# learn = AgentLearner(data, model) -# -# learn.fit(4) -# interp = AgentInterpretationAlpha(learn, ds_type=DatasetType.Train) -# interp.plot_agent_accuracy_density() diff --git a/fast_rl/tests/test_MDPDataBunch.py b/fast_rl/tests/test_MDPDataBunch.py index 0d2bc75..7b5bf3c 100644 --- a/fast_rl/tests/test_MDPDataBunch.py +++ b/fast_rl/tests/test_MDPDataBunch.py @@ -1,80 +1,199 @@ +from functools import partial + import gym -from fastai.basic_data import DataBunch -from fastai.basic_train import Learner, ItemLists -from fastai.vision import ImageDataBunch import numpy as np - -from fast_rl.agents.DQN import FixedTargetDQN -from fast_rl.core.Learner import AgentLearner -from fast_rl.core.MarkovDecisionProcess import MarkovDecisionProcessSlice, MarkovDecisionProcessList, \ - MDPDataBunch, MDPDataset -from fast_rl.core.agent_core import ExperienceReplay - - -def test_MarkovDecisionProcessDataBunch_init(): - error_msg = 'state space is %s but should be %s' - error_msg2 = 'the datasets in the dataloader seem to be different from the data bunches datasets...' 
- - max_steps = 50 - # Create 2 itemlists - train_list = MDPDataset(gym.make('CartPole-v1'), max_steps=max_steps) - valid_list = MDPDataset(gym.make('CartPole-v1'), max_steps=max_steps) - - env_databunch = MDPDataBunch.create(train_list, valid_list, num_workers=0) - epochs = 1 - - assert max_steps == len(train_list) - assert max_steps == len(train_list) - assert max_steps == len(env_databunch.train_dl) - assert max_steps == len(env_databunch.valid_dl) - - for epoch in range(epochs): - for element in env_databunch.train_dl: - env_databunch.train_ds.actions = env_databunch.train_ds.env.action_space.sample() - current_s, actual_s = element.shape[1], train_list.env.observation_space.shape[0] - print(f'state {element} action {env_databunch.train_dl.dl.dataset.actions}') - assert current_s == actual_s, error_msg % (current_s, actual_s) - assert np.equal(env_databunch.train_dl.dl.dataset.actions, env_databunch.train_ds.actions), error_msg2 - - for element in env_databunch.valid_dl: - env_databunch.valid_ds.actions = env_databunch.valid_ds.env.action_space.sample() - current_s, actual_s = element.shape[1], valid_list.env.observation_space.shape[0] - print(f'state {element} action {env_databunch.valid_dl.dl.dataset.actions}') - assert current_s == actual_s, error_msg % (current_s, actual_s) - assert np.equal(env_databunch.valid_dl.dl.dataset.actions, env_databunch.valid_ds.actions), error_msg2 - - -def test_MDPDataset_MemoryManagement(): - data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100, add_valid=False) - model = FixedTargetDQN(data, batch_size=128, max_episodes=10000, lr=0.001, copy_over_frequency=3, - memory=ExperienceReplay(10000), discount=0.99, use_embeddings=True) - learn = AgentLearner(data, model, mem_strategy='k_top_best') - - learn.fit(5) - - -def test_MarkovDecisionProcessDataBunch_init_no_valid(): - error_msg = 'state space is %s but should be %s' - error_msg2 = 'the datasets in the dataloader seem to be different from the data bunches datasets...' 
- - max_steps = 50 - # Create 2 itemlists - train_list = MDPDataset(gym.make('CartPole-v1'), max_steps=max_steps) - - env_databunch = MDPDataBunch.create(train_list, num_workers=0) - env_databunch.valid_dl = None - epochs = 3 - - assert max_steps == len(train_list) - assert max_steps == len(train_list) - assert max_steps == len(env_databunch.train_dl) - assert env_databunch.valid_dl is None - - for epoch in range(epochs): - print(f'epoch {epoch}') - for element in env_databunch.train_dl: - env_databunch.train_ds.actions = env_databunch.train_ds.env.action_space.sample() - current_s, actual_s = element.shape[1], train_list.env.observation_space.shape[0] - print(f'state {element} action {env_databunch.train_dl.dl.dataset.actions}') - assert current_s == actual_s, error_msg % (current_s, actual_s) - assert np.equal(env_databunch.train_dl.dl.dataset.actions, env_databunch.train_ds.actions), error_msg2 +import pytest +import torch +from fastai.basic_train import ItemLists +from gym import error +from gym.envs.algorithmic.algorithmic_env import AlgorithmicEnv +from gym.envs.toy_text import discrete +from gym.wrappers import TimeLimit + +from fast_rl.agents.DQN import DQN +from fast_rl.core.Envs import Envs +from fast_rl.core.MarkovDecisionProcess import Action, Bounds, State, MDPDataset, MDPDataBunch, MDPMemoryManager +from fast_rl.core.basic_train import AgentLearner +from fast_rl.util.exceptions import MaxEpisodeStepsMissingError +from fast_rl.util.misc import list_in_str + +ENV_NAMES = Envs.get_all_latest_envs() + + +def validate_item_list(item_list: ItemLists): + # Check items + for i, item in enumerate(item_list.items): + if item.done: assert not item_list.items[ + i - 1].done, f'The dataset has duplicate "done\'s" that are consecutive.' + assert item.state.s is not None, f'The item: {item}\'s state is None' + assert item.state.s_prime is not None, f'The item: {item}\'s state prime is None' + + +@pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +def test_mdp_clean_callback(env): + data = MDPDataBunch.from_env(env, render='rgb_array') + model = DQN(data) + learner = AgentLearner(data, model) + learner.fit(15) + data.train_ds.env.close() + data.valid_ds.env.close() + del learner +# +# +# @pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +# def test_mdp_databunch(env): +# data = MDPDataBunch.from_env(env, add_valid=False, render='rgb_array') +# for i in range(5): +# for _ in data.train_ds: +# data.train_ds.action = Action(taken_action=data.train_ds.action.action_space.sample(), +# action_space=data.train_ds.action.action_space) +# +# validate_item_list(data.train_ds.x) +# del data +# +# +# @pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +# def test_mdp_dataset_iter(env): +# dataset = MDPDataset(gym.make(env), memory_manager = partial(MDPMemoryManager, strategy='k_partition_top'), bs=8, +# render='rgb_array') +# +# for epoch in range(5): +# for el in dataset: +# dataset.action.set_single_action(dataset.env.action_space.sample()) +# +# # Check items +# for i, item in enumerate(dataset.x.items): +# if item.done: assert not dataset.x.items[ +# i - 1].done, f'The dataset has duplicate "done\'s" that are consecutive.' 
+# assert item.state.s is not None, f'The item: {item}\'s state is None' +# assert item.state.s_prime is not None, f'The item: {item}\'s state prime is None' +# del dataset +# +# +# @pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +# def test_mdpdataset_init(env): +# try: +# init_env = gym.make(env) +# except error.DependencyNotInstalled as e: +# print(e) +# return +# +# data = MDPDataset(init_env, None, 64, 'rgb_array') +# +# try: +# max_steps = data.max_steps +# assert max_steps is not None, f'Max steps is None for env {env}' +# except MaxEpisodeStepsMissingError as e: +# return +# +# envs_to_test = { +# 'CartPole-v0': 200, +# 'MountainCar-v0': 200, +# 'maze-v0': 2000 +# } +# +# if env in envs_to_test: +# assert envs_to_test[env] == max_steps, f'Env {env} is the wrong step amount' +# del data +# +# +# @pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +# def test_bound_init(env): +# try: +# init_env = gym.make(env) +# except error.DependencyNotInstalled as e: +# print(e) +# return +# +# for bound in (Bounds(init_env.action_space), Bounds(init_env.observation_space)): +# if env.lower().__contains__('continuous'): +# assert bound.n_possible_values == np.inf, f'Env {env} is continuous, should have inf values.' +# if env.lower().__contains__('deterministic'): +# assert bound.n_possible_values != np.inf, f'Env {env} is deterministic, should have discrete values.' +# init_env.close() +# +# @pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +# def test_action_init(env): +# try: +# init_env = gym.make(env) +# except error.DependencyNotInstalled as e: +# print(e) +# return +# +# taken_action = init_env.action_space.sample() +# raw_action = np.random.rand(len(Bounds(init_env.action_space).max)) +# init_env.reset() +# _ = init_env.step(taken_action) +# +# action = Action(taken_action=taken_action, raw_action=raw_action, action_space=init_env.action_space) +# +# if list_in_str(env, ['mountaincar-', 'cartpole', 'pong']): +# assert any([action.taken_action.dtype in (int, torch.int, torch.int64)]), f'Action is wrong dtype {action}' +# assert any([action.raw_action.dtype in (float, torch.float32, torch.float64)]), f'Action is wrong dtype {action}' +# if list_in_str(env, ['carracing', 'pendulum']): +# assert any([action.taken_action.dtype in (float, torch.float32, torch.float64)]), f'Action is wrong dtype {action}' +# assert any([action.raw_action.dtype in (float, torch.float32, torch.float64)]), f'Action is wrong dtype {action}' +# init_env.close() +# +# +# @pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +# def test_state_init(env): +# try: +# init_env = gym.make(env) +# except error.DependencyNotInstalled as e: +# print(e) +# return +# +# taken_action = init_env.action_space.sample() +# state = init_env.reset() +# state_prime, reward, done, info = init_env.step(taken_action) +# State(state, state_prime, init_env.observation_space) +# init_env.close() +# +# +# @pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +# def test_state_str(env): +# try: +# init_env = gym.make(env) +# except error.DependencyNotInstalled as e: +# print(e) +# return +# +# render = 'rgb_array' +# if isinstance(init_env, TimeLimit) and isinstance(init_env.unwrapped, (AlgorithmicEnv, discrete.DiscreteEnv)): +# render = 'ansi' if render == 'rgb_array' else render +# +# taken_action = init_env.action_space.sample() +# state = init_env.reset() +# +# try: +# alt_s = init_env.render(render) +# except NotImplementedError: +# return +# +# state_prime, reward, done, info = init_env.step(taken_action) +# 
alt_s_prime = init_env.render(render) +# State(state, state_prime, alt_s, alt_s_prime, init_env.observation_space).__str__() +# init_env.close() +# +# +# # @pytest.mark.parametrize("env", sorted(['CartPole-v0', 'maze-random-5x5-v0'])) +# # def test_state_full_episode(env): +# # try: +# # init_env = gym.make(env) +# # except error.DependencyNotInstalled as e: +# # print(e) +# # return +# # +# # done = False +# # state = init_env.reset() +# # while not done: +# # taken_action = init_env.action_space.sample() +# # alt_state = init_env.render('rgb_array') +# # state_prime, reward, done, info = init_env.step(taken_action) +# # alt_s_prime = init_env.render('rgb_array') +# # State(state, state_prime, alt_state, alt_s_prime, init_env.observation_space) +# # state = state_prime +# # if done: +# # assert state_prime is not None, 'State prime is None, this should not have happened.' +# # init_env.close() diff --git a/fast_rl/tests/test_agent_core.py b/fast_rl/tests/test_agent_core.py index 2a5dd32..c55c40e 100644 --- a/fast_rl/tests/test_agent_core.py +++ b/fast_rl/tests/test_agent_core.py @@ -1,32 +1,34 @@ -from fastai.basic_train import LearnerCallback, DatasetType -from fastai.callback import Callback -from fastai.tabular import tabular_learner -from fastai.vision import cnn_learner, models +import pytest -import numpy as np -from traitlets import List -from typing import Collection - -from fast_rl.agents.BaseAgent import BaseAgent from fast_rl.agents.DDPG import DDPG -from fast_rl.agents.DQN import DQN, FixedTargetDQN, DoubleDQN, DuelingDQN, DoubleDuelingDQN -from fast_rl.core.Interpreter import AgentInterpretationAlpha -from fast_rl.core.Learner import AgentLearner -from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, FEED_TYPE_STATE -from fast_rl.core.agent_core import PriorityExperienceReplay, ExperienceReplay +from fast_rl.agents.DQN import DQN, FixedTargetDQN +from fast_rl.core.MarkovDecisionProcess import MDPDataBunch +from fast_rl.core.agent_core import PriorityExperienceReplay +from fast_rl.core.basic_train import AgentLearner def test_priority_experience_replay(): data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100, add_valid=False) model = FixedTargetDQN(data, memory=PriorityExperienceReplay(1000)) learn = AgentLearner(data, model) - learn.fit(5) - - -# def test_fit_function_ddpg(): -# data = MDPDataBunch.from_env('Pendulum-v0', render='human', max_steps=100, add_valid=False) -# model = DDPG(data, memory=PriorityExperienceReplay(1000)) -# learn = AgentLearner(data, model) -# learn.fit(5) + learn.fit(3) + data.train_ds.env.close() + + +@pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +def test_databunch_dqn_fit(env): + data = MDPDataBunch.from_env(env) + model = DQN(data) + learner = AgentLearner(data=data, model=model) + learner.fit(3) + data.valid_ds.env.close() + data.train_ds.env.close() + +def test_fit_function_ddpg(): + data = MDPDataBunch.from_env('Pendulum-v0', bs=4, render='human', max_steps=100, add_valid=False) + model = DDPG(data, memory=PriorityExperienceReplay(1000)) + learn = AgentLearner(data, model) + learn.fit(3) + data.train_ds.env.close() diff --git a/fast_rl/tests/test_ddpg_models.py b/fast_rl/tests/test_ddpg_models.py index eaa29f6..b72a5c5 100644 --- a/fast_rl/tests/test_ddpg_models.py +++ b/fast_rl/tests/test_ddpg_models.py @@ -1,36 +1,28 @@ from collections import Collection +from functools import partial +from itertools import product import pytest from fastai.basic_train import LearnerCallback from 
fast_rl.agents.DDPG import DDPG
 from fast_rl.core.Envs import Envs
-from fast_rl.core.Learner import AgentLearner
-from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
+from fast_rl.core.MarkovDecisionProcess import FEED_TYPE_IMAGE, FEED_TYPE_STATE, MDPDataBunch
 from fast_rl.core.agent_core import ExperienceReplay, OrnsteinUhlenbeck
+from fast_rl.core.basic_train import AgentLearner
 
+params_ddpg = [DDPG]
+params_envs = ['Pendulum-v0', 'CarRacing-v0']
+params_state_format = [FEED_TYPE_STATE, FEED_TYPE_IMAGE]
 
-ENV_NAMES = Envs.get_all_latest_envs(exclude_key='pybullet')
-
-@pytest.mark.parametrize("env", sorted(ENV_NAMES))
-def test_all_ddpg(env):
-    data = MDPDataBunch.from_env(env, render='human', add_valid=False,
-                                 max_steps=50)
-    if data is None:
-        print(f'Env {env} is probably Mujoco... Add imports if you want and try on your own. Don\'t like '
-              f'proprietary engines like this. If you have any issues, feel free to make a PR!')
-        return
-    # If the action type is int, skip. While there is a capability of int actions spaces for DDPG,
-    # that functionality is more of a niche
-    if data.train_ds.env.action_space.dtype == int: return
-    # If the dtype is None, then it is most likely a Tuple action space. We will most likely open this
-    # functionality in the future.
-    if data.train_ds.env.action_space.dtype is None: return
-
-    # data = MDPDataBunch.from_env('MountainCarContinuous-v0', render='human')
-    model = DDPG(data=data, batch=8, memory=ExperienceReplay(200, reduce_ram=True),
-                 exploration_strategy=OrnsteinUhlenbeck(epsilon_start=1, epsilon_end=0.1, decay=0.0001, size=1,
-                                                        do_exploration=True, end_episode=450))
-    learn = AgentLearner(data, model)
-    learn.fit(5)
+@pytest.mark.parametrize(["env", "model", "s_format"], list(product(params_envs, params_ddpg, params_state_format)))
+def test_ddpg_models(env, model, s_format):
+    model = partial(model, memory=ExperienceReplay(memory_size=1000, reduce_ram=True))
+    data = MDPDataBunch.from_env(env, render='rgb_array', max_steps=20, bs=4, add_valid=False, feed_type=s_format)
+    learn = AgentLearner(data, model(data))
+    learn.fit(3)
+    data.train_ds.env.close()
+    del learn
+    del model
+    del data
diff --git a/fast_rl/tests/test_dqn_models.py b/fast_rl/tests/test_dqn_models.py
index cc0fcd9..60a3689 100644
--- a/fast_rl/tests/test_dqn_models.py
+++ b/fast_rl/tests/test_dqn_models.py
@@ -1,83 +1,32 @@
+from functools import partial
+from itertools import product
 
-def test_basic_dqn_model_maze():
-    from fast_rl.agents.DQN import DQN
-    from fast_rl.core.Learner import AgentLearner
-    from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
-    data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100)
-    model = DQN(data)
-    learn = AgentLearner(data, model)
+import pytest
 
-    learn.fit(5)
+from fast_rl.agents.DQN import DQN, FixedTargetDQN, DoubleDQN, DuelingDQN, DoubleDuelingDQN
+from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, FEED_TYPE_STATE, FEED_TYPE_IMAGE
+from fast_rl.core.agent_core import ExperienceReplay
+from fast_rl.core.basic_train import AgentLearner
 
-def test_fixed_target_dqn_model_maze():
-    from fast_rl.agents.DQN import FixedTargetDQN
-    from fast_rl.core.Learner import AgentLearner
-    from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
-    print('\n')
-    data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100, add_valid=False)
-    model = FixedTargetDQN(data)
-    learn = AgentLearner(data, model)
-
-    learn.fit(5)
-
-def test_fixed_target_dqn_model_cartpole():
-    from 
fast_rl.agents.DQN import FixedTargetDQN
-    from fast_rl.core.Learner import AgentLearner
-    from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
-    from fast_rl.core.agent_core import ExperienceReplay
-    print('\n')
-    data = MDPDataBunch.from_env('CartPole-v1', render='human', max_steps=100, add_valid=False)
-    model = FixedTargetDQN(data, memory=ExperienceReplay(memory_size=100000, reduce_ram=True))
-    learn = AgentLearner(data, model)
-
-    learn.fit(5)
+params_dqn = [DuelingDQN, DoubleDQN, DQN, FixedTargetDQN, DoubleDuelingDQN]
+params_envs = ['CartPole-v0', 'MountainCar-v0', 'Pong-v0']
+params_state_format = [FEED_TYPE_STATE, FEED_TYPE_IMAGE]
 
-def test_fixed_target_dqn_no_explore_model_maze():
-    from fast_rl.agents.DQN import FixedTargetDQN
-    from fast_rl.core.Learner import AgentLearner
-    from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
-    from fast_rl.core.agent_core import GreedyEpsilon
+@pytest.mark.parametrize(["env", "model", "s_format"], list(product(params_envs, params_dqn, params_state_format)))
+def test_dqn_models(env, model, s_format):
+    model = partial(model, memory=ExperienceReplay(memory_size=1000, reduce_ram=True))
     print('\n')
-    data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100, add_valid=False)
-    model = FixedTargetDQN(data, lr=0.01, discount=0.8,
-                           exploration_strategy=GreedyEpsilon(epsilon_start=0, epsilon_end=0,
-                                                              decay=0.001, do_exploration=False))
-    learn = AgentLearner(data, model)
-
-    learn.fit(5)
-
-
-def test_double_dqn_model_maze():
-    from fast_rl.agents.DQN import DoubleDQN
-    from fast_rl.core.Learner import AgentLearner
-    from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
-    data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100)
-    model = DoubleDQN(data, batch_size=8)
-    learn = AgentLearner(data, model)
-
-    learn.fit(5)
-
-
-def test_dueling_dqn_model_maze():
-    from fast_rl.agents.DQN import DuelingDQN
-    from fast_rl.core.Learner import AgentLearner
-    from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
-
-    data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100)
-    model = DuelingDQN(data, batch_size=8)
-    learn = AgentLearner(data, model)
-    learn.fit(5)
+    data = MDPDataBunch.from_env(env, render='rgb_array', max_steps=20, bs=4, add_valid=False,
+                                 feed_type=s_format)
 
-def test_double_dueling_dqn_model_maze():
-    from fast_rl.agents.DQN import DoubleDuelingDQN
-    from fast_rl.core.Learner import AgentLearner
-    from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
+    learn = AgentLearner(data, model(data))
 
-    data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100)
-    model = DoubleDuelingDQN(data)
-    learn = AgentLearner(data, model)
+    learn.fit(3)
 
-    learn.fit(5)
+    data.train_ds.env.close()
+    del learn
+    del model
+    del data
diff --git a/fast_rl/tests/test_metrics.py b/fast_rl/tests/test_metrics.py
index 046ba48..e69de29 100644
--- a/fast_rl/tests/test_metrics.py
+++ b/fast_rl/tests/test_metrics.py
@@ -1,10 +0,0 @@
-from fast_rl.agents.DQN import FixedTargetDQN
-from fast_rl.core.Learner import AgentLearner
-from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
-
-
-def test_epsilon():
-    data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100)
-    model = FixedTargetDQN(data, batch_size=8)
-    learn = AgentLearner(data, model)
-    learn.fit(5)
\ No newline at end of file
diff --git a/fast_rl/util/exceptions.py b/fast_rl/util/exceptions.py
new file mode 100644
index 0000000..28cfd80
--- /dev/null
+++ 
b/fast_rl/util/exceptions.py
@@ -0,0 +1,4 @@
+
+
+class MaxEpisodeStepsMissingError(Exception):
+    """Raised when an env does not expose a max episode step limit."""
\ No newline at end of file
diff --git a/fast_rl/util/misc.py b/fast_rl/util/misc.py
index d0c3051..4ca9ecf 100644
--- a/fast_rl/util/misc.py
+++ b/fast_rl/util/misc.py
@@ -13,6 +13,11 @@ class b_colors:
     UNDERLINE = '\033[4m'
 
 
+def list_in_str(s: str, str_list: list, make_lower=True):
+    if make_lower: s = s.lower()
+    return any(el in s for el in str_list)
+
+
 def is_goal_env(env, suppress_errors=True):
     msg = 'GoalEnv requires the "{}" key to be part of the observation d.'
     # Enforce that each GoalEnv uses a Goal-compatible observation space.
diff --git a/fast_rl/util/random_thingy.py b/fast_rl/util/random_thingy.py
index c2edfef..a82e664 100644
--- a/fast_rl/util/random_thingy.py
+++ b/fast_rl/util/random_thingy.py
@@ -5,25 +5,12 @@ interp.plot_heatmapped_episode(-1)
 """
-from fastai.basic_data import DatasetType
-from fast_rl.agents.DDPG import DDPG
+from fast_rl.core.basic_train import AgentLearner
 from fast_rl.agents.DQN import FixedTargetDQN
-from fast_rl.core.Learner import AgentLearner
 from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
+from fast_rl.core.agent_core import ExperienceReplay
 
-from fast_rl.core.agent_core import GreedyEpsilon, OrnsteinUhlenbeck, ExperienceReplay
-from fast_rl.core.metrics import EpsilonMetric
-
-data = MDPDataBunch.from_env('CartPole-v1', render='human', add_valid=False)
-# data = MDPDataBunch.from_env('Pendulum-v0', render='human', add_valid=False)
-# data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=1000, add_valid=False)
-
-model = FixedTargetDQN(data, batch_size=64, memory=ExperienceReplay(memory_size=40000, reduce_ram=True),
-                       exploration_strategy=GreedyEpsilon(decay=0.001, epsilon_start=1, epsilon_end=0.01))
-# model = DDPG(data=data, batch=128, memory=ExperienceReplay(40000, reduce_ram=True),
-#              exploration_strategy=OrnsteinUhlenbeck(epsilon_start=1, epsilon_end=0.1, decay=0.0001, size=1,
-#                                                     do_exploration=True, end_episode=450))
-
-learn = AgentLearner(data, model)#, metrics=[EpsilonMetric])
-
+data = MDPDataBunch.from_env('Pong-v0', render='human', max_steps=100, add_valid=False)
+model = FixedTargetDQN(data, memory=ExperienceReplay(memory_size=100000, reduce_ram=True))
+learn = AgentLearner(data, model)
 learn.fit(450)
\ No newline at end of file
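
Note: the new `MaxEpisodeStepsMissingError` is currently only exercised by the commented-out `test_mdpdataset_init` above. A minimal sketch of the intended handling pattern, assuming only the `MDPDataset.max_steps` property and the exception introduced in this patch; the `safe_max_steps` helper and its fallback value are purely illustrative:

```python
from fast_rl.util.exceptions import MaxEpisodeStepsMissingError


def safe_max_steps(dataset, fallback=200):
    # `dataset` is expected to be an MDPDataset; `fallback` is an arbitrary illustrative default.
    try:
        return dataset.max_steps
    except MaxEpisodeStepsMissingError:
        # Some envs/wrappers never register a max episode step limit, so fall back instead of failing.
        return fallback
```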