From bc412b727a276e75154af577d622fbbdad2e9b2f Mon Sep 17 00:00:00 2001 From: josiahls Date: Fri, 25 Oct 2019 22:31:20 -0400 Subject: [PATCH] Version 0_8_0: Stable (#9) * Init new branch * Added: * Action object: For validating action sizes / dims, bundling important info * State: For validating state sizes / dims, bundling important info * Bounds: For determining the dtypes of its parent object, and determining if the object is discrete * Initial mass refactor of MDPDataset * Max Step expectation * Fixed: * MDPDataset episode iteration Notes: * Plan to add a generic MDPStep list validation function. A few things to expect: - There should never be 2 "done" steps in a row - Right after a done step, the step counter should show 0. - There should never be Nones in the state values. - Need to check for bad copies * Added: * WrapperLossFunc for compatibility with the existing fastai fit function * native fastai fit function compatibility :)))))))) * Single Learner that subclasses fastai Learner Notes: * Next Commit will have the old code removed * Fixed: * Memory handler. For now keeping k top. Less confusing implementation. * Removed old code Notes: * Need to now revise the DQN's and DDPG's to be compatible. * Fixed: * DDPG compat * Other DQN compat Removed: * nn and cnn generic functions for now. They were just way too confusing :( . The DDPG works now also. * Interpreter test code. Notes: * Interpreter is broken for now * Redo Tests --- README.md | 41 +- azure-pipelines.yml | 2 +- docs_src/rl.agents.dqnfixedtarget.ipynb | 10 +- docs_src/rl.core.mdp_interpreter.ipynb | 12 +- fast_rl/agents/BaseAgent.py | 78 +- fast_rl/agents/DDPG.py | 127 ++-- fast_rl/agents/DQN.py | 237 +++---- fast_rl/core/Envs.py | 19 +- fast_rl/core/Interpreter.py | 18 +- fast_rl/core/Learner.py | 37 - fast_rl/core/MarkovDecisionProcess.py | 900 ++++++++++++------------ fast_rl/core/agent_core.py | 107 +-- fast_rl/core/basic_train.py | 31 + fast_rl/tests/test_DataBunch.py | 40 +- fast_rl/tests/test_Envs.py | 66 +- fast_rl/tests/test_Interpretation.py | 50 -- fast_rl/tests/test_MDPDataBunch.py | 275 ++++++-- fast_rl/tests/test_agent_core.py | 46 +- fast_rl/tests/test_ddpg_models.py | 42 +- fast_rl/tests/test_dqn_models.py | 93 +-- fast_rl/tests/test_metrics.py | 10 - fast_rl/util/exceptions.py | 4 + fast_rl/util/misc.py | 5 + fast_rl/util/random_thingy.py | 23 +- 24 files changed, 1040 insertions(+), 1233 deletions(-) delete mode 100644 fast_rl/core/Learner.py create mode 100644 fast_rl/core/basic_train.py create mode 100644 fast_rl/util/exceptions.py diff --git a/README.md b/README.md index 2803e12..4752e34 100644 --- a/README.md +++ b/README.md @@ -70,41 +70,10 @@ known environments. Prior to 1.0.0, new changes might break previous code versio working at their best. Post 1.0.0 will be more formal feature development with CI, unit testing, etc. **Critical** -- [X] 0.0.0 MDPDataBunch: Finished to the point of being useful. Please reference: `tests/test_Envs` -Example: -```python -from fast_rl.core.Envs import Envs -from fast_rl.core.MarkovDecisionProcess import MDPDataBunch - -# At present will try to load OpenAI, box2d, pybullet, atari, maze. -# note "get_all_latest_envs" has a key inclusion and exclusion so if you don't have some of these envs installed, -# you can avoid this here. Certain envs just flat out do not work / are unusual. You are welcome to see how to get them -# working. -for env in Envs.get_all_latest_envs(): - max_steps = 50 # Limit the number of per episode iterations for now. 
- print(f'Testing {env}') - mdp_databunch = MDPDataBunch.from_env(env, max_steps=max_steps, num_workers=0) - if mdp_databunch is None: - print(f'Env {env} is probably Mujoco... Add imports if you want and try on your own. Don\'t like ' - f'proprietary engines like this. If you have any issues, feel free to make a PR!') - else: - epochs = 1 # N episodes to run - for epoch in range(epochs): - for state in mdp_databunch.train_dl: - # Instead of random action, you would have your agent here - mdp_databunch.train_ds.actions = mdp_databunch.train_ds.get_random_action() - - for state in mdp_databunch.valid_dl: - # Instead of random action, you would have your agent here and have exploration to 0 - mdp_databunch.valid_ds.actions = mdp_databunch.valid_ds.get_random_action() -``` -- [X] 0.1.0 DQN Agent: Reference `tests/test_Learner/test_basic_dqn_model_maze`. We use Learner callbacks for -handling the different fit behaviors. - Testable code: ```python from fast_rl.agents.DQN import DQN -from fast_rl.core.Learner import AgentLearner +from fast_rl.core.basic_train import AgentLearner from fast_rl.core.MarkovDecisionProcess import MDPDataBunch data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human') @@ -130,13 +99,15 @@ Usage example: ```python from fast_rl.agents.DQN import DQN from fast_rl.core.Interpreter import AgentInterpretationAlpha -from fast_rl.core.Learner import AgentLearner +from fast_rl.core.basic_train import AgentLearner from fast_rl.core.MarkovDecisionProcess import MDPDataBunch data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human') model = DQN(data) learn = AgentLearner(data, model) learn.fit(10) + +# Note that the Interpretation is broken, will be fixed with documentation in 0.9 interp = AgentInterpretationAlpha(learn) interp.plot_heatmapped_episode(5) ``` @@ -229,8 +200,8 @@ learn.fit(5) reset commit - [X] 0.7.0 Full test suite using multi-processing. Connect to CI. -- [ ] **Working On** 0.8.0 Comprehensive model eval **debug/verify**. Each model should succeed at at least a few known environments. Also, massive refactoring will be needed. -- [ ] 0.9.0 Notebook demonstrations of basic model usage. +- [X] 0.8.0 Comprehensive model eval **debug/verify**. Each model should succeed at at least a few known environments. Also, massive refactoring will be needed. +- [ ] **Working on** 0.9.0 Notebook demonstrations of basic model usage. - [ ] **1.0.0** Base version is completed with working model visualizations proving performance / expected failure. At this point, all models should have guaranteed environments they should succeed in. 
- [ ] 1.2.0 Add PyBullet Fetch Environments diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 7db0446..f691bc4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -30,7 +30,7 @@ steps: displayName: 'Install Python Packages' - script: | - xvfb-run -s "-screen 0 1400x900x24" pytest -n 8 fast_rl/tests --doctest-modules --junitxml=junit/test-results.xml --cov=./ --cov-report=xml --cov-report=html + xvfb-run -s "-screen 0 1400x900x24" pytest -n 2 fast_rl/tests --doctest-modules --junitxml=junit/test-results.xml --cov=./ --cov-report=xml --cov-report=html displayName: 'Test with pytest' - task: PublishTestResults@2 diff --git a/docs_src/rl.agents.dqnfixedtarget.ipynb b/docs_src/rl.agents.dqnfixedtarget.ipynb index dd7e837..e3b1a7f 100644 --- a/docs_src/rl.agents.dqnfixedtarget.ipynb +++ b/docs_src/rl.agents.dqnfixedtarget.ipynb @@ -15,10 +15,10 @@ "import fast_rl.agents.DQN \n", "from fast_rl.agents.DQN import DQN, FixedTargetDQN, DoubleDQN, DuelingDQN, DoubleDuelingDQN\n", "from fast_rl.core.Interpreter import AgentInterpretationAlpha\n", - "from fast_rl.core.Learner import AgentLearner\n", - "from fast_rl.core.MarkovDecisionProcess import MDPDataBunch\n", + "from fast_rl.core.Learner import AgentLearnerAlpha\n", + "from fast_rl.core.MarkovDecisionProcess import MDPDataBunchAlpha\n", "from fast_rl.core.agent_core import PriorityExperienceReplay, ExperienceReplay\n", - "from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, FEED_TYPE_IMAGE, FEED_TYPE_STATE\n", + "from fast_rl.core.MarkovDecisionProcess import MDPDataBunchAlpha, FEED_TYPE_IMAGE, FEED_TYPE_STATE\n", "from fast_rl.core.agent_core import ExperienceReplay, GreedyEpsilon\n", "import sys\n", "import importlib" @@ -423,12 +423,12 @@ } ], "source": [ - "data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=1000)\n", + "data = MDPDataBunchAlpha.from_env('maze-random-5x5-v0', render='human', max_steps=1000)\n", "model = FixedTargetDQN(data, batch_size=128, max_episodes=50, lr=0.001, copy_over_frequency=3,\n", " memory=ExperienceReplay(10000), discount=0.99, \n", " exploration_strategy=GreedyEpsilon(epsilon_start=1, epsilon_end=0.1,\n", " decay=0.001, do_exploration=True))\n", - "learn = AgentLearner(data, model)\n", + "learn = AgentLearnerAlpha(data, model)\n", "\n", "learn.fit(50)" ] diff --git a/docs_src/rl.core.mdp_interpreter.ipynb b/docs_src/rl.core.mdp_interpreter.ipynb index c7e38aa..315ead8 100644 --- a/docs_src/rl.core.mdp_interpreter.ipynb +++ b/docs_src/rl.core.mdp_interpreter.ipynb @@ -79,14 +79,14 @@ "import numpy as np\n", "\n", "from fast_rl.agents.DQN import DQN\n", - "from fast_rl.core.Learner import AgentLearner\n", - "from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, MDPDataset\n", + "from fast_rl.core.Learner import AgentLearnerAlpha\n", + "from fast_rl.core.MarkovDecisionProcess import MDPDataBunchAlpha, MDPDatasetAlpha\n", "from fast_rl.core.Interpreter import AgentInterpretationAlpha\n", "%matplotlib inline\n", " \n", - "data = MDPDataBunch.from_env('CartPole-v1', render='human', bs=64)\n", + "data = MDPDataBunchAlpha.from_env('CartPole-v1', render='human', bs=64)\n", "model = DQN(data)\n", - "learn = AgentLearner(data, model)\n", + "learn = AgentLearnerAlpha(data, model)\n", "\n", "learn.fit(5)" ] @@ -106,9 +106,9 @@ "metadata": {}, "outputs": [], "source": [ - "data = MDPDataBunch.from_pickle('CartPole-v1', render='human', bs=64)\n", + "data = MDPDataBunchAlpha.from_pickle('CartPole-v1', render='human', bs=64)\n", "model = DQN(data)\n", - "learn = 
AgentLearner(data, model)" + "learn = AgentLearnerAlpha(data, model)" ] }, { diff --git a/fast_rl/agents/BaseAgent.py b/fast_rl/agents/BaseAgent.py index d6a7a28..f577e7e 100644 --- a/fast_rl/agents/BaseAgent.py +++ b/fast_rl/agents/BaseAgent.py @@ -1,15 +1,12 @@ from math import floor +from typing import Collection -import gym +import numpy as np import torch -from fastai.basic_train import LearnerCallback, Any -from fastai.callback import Callback +from fastai.basic_train import LearnerCallback from fastai.layers import bn_drop_lin from gym.spaces import Discrete, Box from torch import nn -from traitlets import List -import numpy as np -from typing import Collection from fast_rl.core.MarkovDecisionProcess import MDPDataBunch from fast_rl.core.agent_core import ExplorationStrategy @@ -29,6 +26,7 @@ def __init__(self, data: MDPDataBunch): self.loss = None self.out = None self.opt = None + self.warming_up = False self.learner_callbacks = [] # type: Collection[LearnerCallback] # Root model that will be accessed for action decisions self.action_model = None # type: nn.Module @@ -77,39 +75,6 @@ def forward(self, x): return x.view(x.size(0), -1) - - -def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use_embed=False, - activation_function=None, final_activation_function=None, action_val_to_dim=True): - """Generates an nn module. - - Notes: - TabularModel could possibly be used along side a cnn learner instead. Will be a good idea to investigate. - - Returns: - - """ - act = nn.LeakyReLU if activation_function is None else activation_function - # For now the dimension of the action does not make a difference. - action_size = action_size[0] if not action_val_to_dim else action_size[1] - # For now keep drop out as 0, test including dropout later - ps = [0] * len(layer_list) - sizes = [state_size] + layer_list + [action_size] - actns = [act() for _ in range(len(sizes) - 2)] + [None] - layers = [] - for i, (n_in, n_out, dp, act) in enumerate(zip(sizes[:-1], sizes[1:], [0.] 
+ ps, actns)): - if i == 0 and use_embed: - embedded, n_in = get_embedded(n_in[0], n_out, n_in[1], 5) - layers += [ToLong(), embedded, Flatten()] - elif i == 0: n_in = n_in[0] - if i == 0 and use_bn: layers += [nn.BatchNorm1d(n_in)] - - layers += bn_drop_lin(n_in, n_out, bn=use_bn and i != 0, p=dp, actn=act) - - if final_activation_function is not None: layers += [final_activation_function()] - return nn.Sequential(*layers) - - def get_next_conv_shape(c_w, c_h, stride, kernel_size): h = floor((c_h - kernel_size - 2) / stride) + 1 # 3 convolutional layers given (3c, 640w, 640h) w = floor((c_w - kernel_size - 2) / stride) + 1 @@ -117,7 +82,7 @@ def get_next_conv_shape(c_w, c_h, stride, kernel_size): def get_conv(input_tuple, act, kernel_size, stride, n_conv_layers, layers): - """ + r""" Useful guideline for convolutional net shape change: Shape: @@ -141,37 +106,12 @@ def get_conv(input_tuple, act, kernel_size, stride, n_conv_layers, layers): n_conv_layers: layers: """ - h, w = input_tuple[0], input_tuple[1] + h, w = input_tuple[1], input_tuple[2] conv_layers = [SwapImageChannel()] for i in range(n_conv_layers): h, w = get_next_conv_shape(h, w, stride, kernel_size) - conv_layers.append(torch.nn.Conv2d(input_tuple[2], 3, kernel_size=kernel_size, stride=stride)) + conv_layers.append(torch.nn.Conv2d(input_tuple[3], 3, kernel_size=kernel_size, stride=stride)) conv_layers.append(act) - return layers + conv_layers, 3 * (h + 1) * (w + 1) - -def create_cnn_model(layer_list: list, action_size, state_size, use_bn=False, kernel_size=5, stride=3, n_conv_layers=3, - activation_function=None, final_activation_function=None, action_val_to_dim=True): - """Generates an nn module. - - Notes: - TabularModel could possibly be used along side a cnn learner instead. Will be a good idea to investigate. - - Returns: - - """ - act = nn.LeakyReLU if activation_function is None else activation_function - # For now keep drop out as 0, test including dropout later - ps = [0] * len(layer_list) - action_size = action_size[0] if not action_val_to_dim else action_size[1] - sizes = [state_size[0]] + layer_list + [action_size] - actns = [act() for _ in range(n_conv_layers + len(sizes) - 2)] + [None] - layers = [] - for i, (n_in, n_out, dp, act) in enumerate(zip(sizes[:-1], sizes[1:], [0.] 
+ ps, actns)): - if type(n_in) == tuple: - layers, n_in = get_conv(n_in, act, kernel_size, n_conv_layers=n_conv_layers, layers=layers, stride=stride) - layers += [Flatten()] - - layers += bn_drop_lin(n_in, n_out, bn=use_bn and i != 0, p=dp, actn=act) - if final_activation_function is not None: layers += [final_activation_function()] - return nn.Sequential(*layers) \ No newline at end of file + output_size = torch.prod(torch.tensor(nn.Sequential(*(layers + conv_layers))(torch.rand(input_tuple)).shape)) + return layers + conv_layers, output_size diff --git a/fast_rl/agents/DDPG.py b/fast_rl/agents/DDPG.py index 1f591ca..ac523b1 100644 --- a/fast_rl/agents/DDPG.py +++ b/fast_rl/agents/DDPG.py @@ -1,18 +1,28 @@ from copy import deepcopy -import torch -from fastai.basic_train import LearnerCallback, Any, OptimWrapper, ifnone, F import numpy as np -from fastai.metrics import RMSE +import torch +from fastai.basic_train import LearnerCallback, Any, OptimWrapper, ifnone from torch import nn from torch.nn import MSELoss from torch.optim import Adam -from fast_rl.agents.BaseAgent import BaseAgent, create_nn_model, create_cnn_model, get_next_conv_shape, get_conv, \ +from fast_rl.agents.BaseAgent import BaseAgent, get_conv, \ Flatten -from fast_rl.core.Learner import AgentLearner -from fast_rl.core.MarkovDecisionProcess import MDPDataBunch -from fast_rl.core.agent_core import GreedyEpsilon, ExperienceReplay +from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, Action, State +from fast_rl.core.agent_core import ExperienceReplay, OrnsteinUhlenbeck + + +def get_action_ddpg_cnn(layers, action: Action, state: State, activation=nn.ReLU, kernel_size=5, stride=2): + module_layers, out_size = get_conv(state.s.shape, activation(), kernel_size=kernel_size, stride=stride, + n_conv_layers=3, layers=[]) + module_layers += [Flatten()] + layers.append(action.taken_action.shape[1]) + for i, layer in enumerate(layers): + module_layers += [nn.Linear(out_size, layer)] if i == 0 else [nn.Linear(layers[i - 1], layer)] + module_layers += [activation()] + + return nn.Sequential(*module_layers) class BaseDDPGCallback(LearnerCallback): @@ -28,8 +38,6 @@ def on_train_begin(self, n_epochs, **kwargs: Any): def on_epoch_begin(self, epoch, **kwargs: Any): self.episode = epoch - # if self.learn.model.training and self.iteration != 0: - # self.learn.model.memory.update(item=self.learn.data.x.items[-1]) self.iteration = 0 def on_loss_begin(self, **kwargs: Any): @@ -41,21 +49,42 @@ def on_loss_begin(self, **kwargs: Any): post_optimize = self.learn.model.optimize() if self.learn.model.training: self.learn.model.memory.refresh(post_optimize=post_optimize) - # if self.iteration % self.copy_over_frequency == 0: self.learn.model.target_copy_over() self.iteration += 1 - # - # def on_epoch_end(self, **kwargs: Any): - # if self.episode % self.copy_over_frequency == 0: - # self.learn.model.target_copy_over() + + +class NNActor(nn.Module): + def __init__(self, layers, action: Action, state: State, activation=nn.ReLU, embed=False): + super().__init__() + layers += [action.taken_action.shape[1]] + module_layers = [] + + for i, layer in enumerate(layers): + module_layers.append(nn.Linear(state.s.shape[1] if i == 0 else layers[i - 1], layer)) + if i != len(layers) - 1: module_layers.append(activation()) + + module_layers += [nn.Tanh()] + self.model = nn.Sequential(*module_layers) + + def forward(self, x): + return self.model(x) + + +class CNNActor(nn.Module): + def __init__(self, layers, action: Action, state: State, activation=nn.ReLU): + 
super().__init__() + # This is still some complete overlap in nn builders, for here, the default function has everything we need + self.model = get_action_ddpg_cnn(layers, action, state, activation=activation, kernel_size=5, stride=2) + + def forward(self, x): + return self.model(x) class NNCritic(nn.Module): - def __init__(self, layer_list: list, action_size, state_size, use_bn=False, use_embed=True, - activation_function=None): + def __init__(self, layer_list: list, action: Action, state: State): super().__init__() - self.action_size = action_size[0] - self.state_size = state_size[0] + self.action_size = action.taken_action.shape[1] + self.state_size = state.s.shape[1] self.fc1 = nn.Linear(self.state_size, layer_list[0]) self.fc2 = nn.Linear(layer_list[0] + self.action_size, layer_list[1]) @@ -72,10 +101,10 @@ def forward(self, x): class CNNCritic(nn.Module): - def __init__(self, layer_list: list, action_size, state_size, activation_function=None): + def __init__(self, action: Action, state: State): super().__init__() - self.action_size = action_size[0] - self.state_size = state_size[0] + self.action_size = action.taken_action.shape[1] + self.state_size = state.s.shape layers = [] layers, input_size = get_conv(self.state_size, nn.LeakyReLU(), 8, 2, 3, layers) @@ -97,15 +126,15 @@ def forward(self, x): class DDPG(BaseAgent): - def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, batch=64, discount=0.99, + def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, discount=0.99, lr=1e-3, actor_lr=1e-4, exploration_strategy=None): """ - Implementation of a continuous control algorithm using an actor/critic architecture. + Implementation of a discrete control algorithm using an actor/critic architecture. Notes: Uses 4 networks, 2 actors, 2 critics. All models use batch norm for feature invariance. - NNCritic simply predicts Q while the Actor proposes the actions to take given a state s. + NNCritic simply predicts Q while the Actor proposes the actions to take given a s s. References: [1] Lillicrap, Timothy P., et al. "Continuous control with deep reinforcement learning." @@ -115,7 +144,6 @@ def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, batch=64, discount data: Primary data object to use. memory: How big the memory buffer will be for offline training. tau: Defines how "soft/hard" we will copy the target networks over to the primary networks. - batch: Size of per memory query. discount: Determines the amount of discounting the existing Q reward. lr: Rate that the opt will learn parameter gradients. 
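+            actor_lr: Separate learning rate used when updating the actor (defaults to 1e-4 in the signature).
+            exploration_strategy: Strategy for action noise during training; if None, OrnsteinUhlenbeck noise is used.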
""" @@ -123,8 +151,9 @@ def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, batch=64, discount self.name = 'DDPG' self.lr = lr self.discount = discount - self.batch = batch self.tau = 1 + self.warming_up = True + self.batch_size = data.train_ds.bs self.memory = ifnone(memory, ExperienceReplay(10000)) self.action_model = self.initialize_action_model([400, 300], data) @@ -143,28 +172,23 @@ def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, batch=64, discount self.loss_func = MSELoss() - self.exploration_strategy = ifnone(exploration_strategy, GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, - decay=0.001, - do_exploration=self.training)) + self.exploration_strategy = ifnone(exploration_strategy, OrnsteinUhlenbeck(size=data.action.taken_action.shape, + epsilon_start=1, epsilon_end=0.1, + decay=0.001, + do_exploration=self.training)) def initialize_action_model(self, layers, data): - actions, state = data.get_action_state_size() - if type(state[0]) is tuple and len(state[0]) == 3: - # actions, state = actions[0], state[0] - # If the shape has 3 dimensions, we will try using cnn's instead. - return create_cnn_model([200, 200], actions, state, False, kernel_size=8, - final_activation_function=nn.Tanh, action_val_to_dim=False) + if len(data.state.s.shape) == 4 and data.state.s.shape[-1] < 4: + return CNNActor(layers, data.action, data.state) else: - return create_nn_model(layers, *data.get_action_state_size(), False, use_embed=data.train_ds.embeddable, - final_activation_function=nn.Tanh, action_val_to_dim=False) + return NNActor(layers, data.action, data.state) def initialize_critic_model(self, layers, data): - """ Instead of state -> action, we are going state + action -> single expected reward. """ - actions, state = data.get_action_state_size() - if type(state[0]) is tuple and len(state[0]) == 3: - return CNNCritic(layers, *data.get_action_state_size()) + """ Instead of s -> action, we are going s + action -> single expected reward. """ + if len(data.state.s.shape) == 4 and data.state.s.shape[-1] < 4: + return CNNCritic(data.action, data.state) else: - return NNCritic(layers, *data.get_action_state_size()) + return NNCritic(layers, data.action, data.state) def pick_action(self, x): if self.training: self.action_model.eval() @@ -174,7 +198,7 @@ def pick_action(self, x): return np.clip(action, -1, 1) def optimize(self): - """ + r""" Performs separate updates to the actor and critic models. Get the predicted yi for optimizing the actor: @@ -187,14 +211,17 @@ def optimize(self): Returns: """ - if len(self.memory) > self.batch: + if len(self.memory) > self.batch_size: + self.warming_up = False # Perhaps have memory as another item list? Should investigate. - sampled = self.memory.sample(self.batch) + sampled = self.memory.sample(self.batch_size) - r = torch.from_numpy(np.array([item.reward for item in sampled]).astype(float)).float() - s_prime = torch.from_numpy(np.array([item.result_state for item in sampled])).float() - s = torch.from_numpy(np.array([item.current_state for item in sampled])).float() - a = torch.from_numpy(np.array([item.actions for item in sampled]).astype(float)).float() + with torch.no_grad(): + r = torch.cat([item.reward.float() for item in sampled]) + s_prime = torch.cat([item.s_prime.float() for item in sampled]) + s = torch.cat([item.s.float() for item in sampled]) + a = torch.cat([item.a.float() for item in sampled]) + # d = torch.cat([item.done.float() for item in sampled]) # Do we need a mask?? 
with torch.no_grad(): y = r + self.discount * self.t_critic_model((s_prime, self.t_action_model(s_prime))) @@ -234,7 +261,7 @@ def target_copy_over(self): def soft_target_copy_over(self, t_m, f_m, tau): for target_param, local_param in zip(t_m.parameters(), f_m.parameters()): - target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) + target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def interpret_q(self, items): with torch.no_grad(): diff --git a/fast_rl/agents/DQN.py b/fast_rl/agents/DQN.py index 462bc62..c6edd2f 100644 --- a/fast_rl/agents/DQN.py +++ b/fast_rl/agents/DQN.py @@ -1,20 +1,19 @@ -import copy from copy import deepcopy from functools import partial +from typing import Tuple -import numpy as np import torch from fastai.basic_train import LearnerCallback, Any, F, OptimWrapper, ifnone from torch import optim, nn -from fast_rl.agents.BaseAgent import BaseAgent, create_nn_model, create_cnn_model, ToLong, get_embedded, Flatten -from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, FEED_TYPE_IMAGE +from fast_rl.agents.BaseAgent import BaseAgent, ToLong, get_embedded, Flatten, get_conv +from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, MDPDataset, State, Action from fast_rl.core.agent_core import ExperienceReplay, GreedyEpsilon class BaseDQNCallback(LearnerCallback): def __init__(self, learn, max_episodes=None): - """Handles basic DQN end of step model optimization.""" + r"""Handles basic DQN end of step model optimization.""" super().__init__(learn) self.n_skipped = 0 self._persist = max_episodes is not None @@ -33,11 +32,8 @@ def on_epoch_begin(self, epoch, **kwargs: Any): self.iteration = 0 def on_loss_begin(self, **kwargs: Any): - """Performs memory updates, exploration updates, and model optimization.""" - if self.learn.model.training and self.previous_item is not None: - if self.learn.data.x.items[-2].done: self.previous_item.done = self.learn.data.x.items[-2].done - self.learn.model.memory.update(item=self.previous_item) - self.previous_item = copy.deepcopy(self.learn.data.x.items[-1]) + r"""Performs memory updates, exploration updates, and model optimization.""" + if self.learn.model.training: self.learn.model.memory.update(item=self.learn.data.train_ds.x.items[-1]) self.learn.model.exploration_strategy.update(self.episode, max_episodes=self.max_episodes, do_exploration=self.learn.model.training) post_optimize = self.learn.model.optimize() @@ -45,8 +41,6 @@ def on_loss_begin(self, **kwargs: Any): self.iteration += 1 - - class FixedTargetDQNCallback(LearnerCallback): def __init__(self, learn, copy_over_frequency=3): """Handles updating the target model in a fixed target DQN. 
@@ -66,32 +60,38 @@ def on_step_end(self, **kwargs: Any): self.learn.model.target_copy_over() -class DQNActionNN(nn.Module): - def __init__(self, layers, action, state, activation=nn.ReLU, embed=False): - super().__init__() - - module_layers = [] - for i, size in enumerate(layers): - if i == 0: - if embed: - embedded, out = get_embedded(state[0], size, state[1], 5) - module_layers += [ToLong(), embedded, Flatten(), nn.Linear(out, size)] - else: - module_layers.append(nn.Linear(state[0], size)) +def get_action_dqn_fully_conn(layers, action: Action, state: State, activation=nn.ReLU, embed=False): + module_layers = [] + for i, size in enumerate(layers): + if i == 0: + if embed: + embedded, out = get_embedded(state.s.shape[1], size, state.n_possible_values, 5) + module_layers += [ToLong(), embedded, Flatten(), nn.Linear(out, size)] else: - module_layers.append(nn.Linear(layers[i-1], size)) - module_layers.append(activation()) + module_layers.append(nn.Linear(state.s.shape[1], size)) + else: + module_layers.append(nn.Linear(layers[i - 1], size)) + module_layers.append(activation()) + + module_layers.append(nn.Linear(layers[-1], action.n_possible_values)) + return nn.Sequential(*module_layers) - module_layers.append(nn.Linear(layers[-1], action[1])) - self.model = nn.Sequential(*module_layers) - def forward(self, x, **kwargs: Any): - return self.model(x) +def get_action_dqn_cnn(layers, action: Action, state: State, activation=nn.ReLU, kernel_size=5, stride=2): + module_layers, out_size = get_conv(state.s.shape, activation(), kernel_size=kernel_size, stride=stride, + n_conv_layers=3, layers=[]) + module_layers += [Flatten()] + layers.append(action.n_possible_values) + for i, layer in enumerate(layers): + module_layers += [nn.Linear(out_size, layer)] if i == 0 else [nn.Linear(layers[i - 1], layer)] + if i != len(layers) - 1: module_layers += [activation()] + + return nn.Sequential(*module_layers) class DQN(BaseAgent): - def __init__(self, data: MDPDataBunch, memory=None, batch_size=32, lr=0.01, discount=0.95, grad_clip=5, - max_episodes=None, exploration_strategy=None, use_embeddings=False): + def __init__(self, data: MDPDataBunch, memory=None, lr=0.01, discount=0.95, grad_clip=5, + max_episodes=None, exploration_strategy=None, use_embeddings=False, layers=None): """Trains an Agent using the Q Learning method on a neural net. Notes: @@ -105,16 +105,17 @@ def __init__(self, data: MDPDataBunch, memory=None, batch_size=32, lr=0.01, disc data: Used for size input / output information. """ super().__init__(data) - # TODO add recommend cnn based on state size? + # TODO add recommend cnn based on s size? 
self.name = 'DQN' self.use_embeddings = use_embeddings - self.batch_size = batch_size + self.batch_size = data.train_ds.bs self.discount = discount + self.warming_up = True self.lr = lr self.gradient_clipping_norm = grad_clip - self.loss_func = F.mse_loss #F.smooth_l1_loss + self.loss_func = F.mse_loss self.memory = ifnone(memory, ExperienceReplay(10000)) - self.action_model = self.initialize_action_model([24, 24], data) + self.action_model = self.initialize_action_model(ifnone(layers, [24, 24]), data.train_ds) self.opt = OptimWrapper.create(optim.Adam, lr=self.lr, layer_groups=[self.action_model]) self.learner_callbacks += [partial(BaseDQNCallback, max_episodes=max_episodes)] + self.memory.callbacks self.exploration_strategy = ifnone(exploration_strategy, GreedyEpsilon(epsilon_start=1, epsilon_end=0.1, @@ -127,10 +128,9 @@ def init_weights(self, m): m.bias.data.fill_(0.01) def initialize_action_model(self, layers, data): - # if self.data.train_ds.feed_type == FEED_TYPE_IMAGE: model = create_cnn_model(layers, *data.get_action_state_size(), action_val_to_dim=True) - # else: model = create_nn_model(layers, *data.get_action_state_size(), use_embed=False, action_val_to_dim=True) - model = DQNActionNN(layers, *data.get_action_state_size(), embed=self.use_embeddings) # type: nn.Module - + if len(data.state.s.shape) == 4 and data.state.s.shape[-1] < 4: + model = get_action_dqn_cnn(deepcopy(layers), data.action, data.state, kernel_size=5, stride=2) + else: model = get_action_dqn_fully_conn(deepcopy(layers), data.action, data.state, embed=self.use_embeddings) model.apply(self.init_weights) return model @@ -138,8 +138,27 @@ def forward(self, x): x = super(DQN, self).forward(x) return self.action_model(x) + def sample_mask(self) -> Tuple[torch.tensor, torch.tensor, torch.tensor, torch.tensor, torch.tensor, torch.tensor]: + self.warming_up = False + # Perhaps have memory as another itemlist? Should investigate. + sampled = self.memory.sample(self.batch_size) + with torch.no_grad(): + r = torch.cat([item.reward.float() for item in sampled]) + s_prime = torch.cat([item.s_prime.float() for item in sampled]) + s = torch.cat([item.s.float() for item in sampled]) + a = torch.cat([item.a.long() for item in sampled]) + d = torch.cat([item.done.float() for item in sampled]) + + masking = torch.sub(1.0, d) + return r, s_prime, s, a, d, masking + + def calc_y_hat(self, s, a): return self.action_model(s).gather(1, a) + + def calc_y(self, s_prime, masking, r, y_hat): + return self.discount * self.action_model(s_prime).max(axis=1)[0].unsqueeze(1) * masking + r.expand_as(y_hat) + def optimize(self): - """Uses ER to optimize the Q-net (without fixed targets). + r"""Uses ER to optimize the Q-net (without fixed targets). Uses the equation: @@ -149,22 +168,15 @@ def optimize(self): Returns (dict): Optimization information + """ if len(self.memory) > self.batch_size: - # Perhaps have memory as another itemlist? Should investigate. 
- sampled = self.memory.sample(self.batch_size) - with torch.no_grad(): - r = torch.from_numpy(np.array([item.reward for item in sampled])).float() - s_prime = torch.from_numpy(np.array([item.result_state for item in sampled])).float() - s = torch.from_numpy(np.array([item.current_state for item in sampled])).float() - a = torch.from_numpy(np.array([item.actions for item in sampled])).long() - d = torch.from_numpy(np.array([item.done for item in sampled])).float() + r, s_prime, s, a, d, masking = self.sample_mask() - masking = torch.sub(1.0, d).unsqueeze(0) # Traditional `maze-random-5x5-v0` with have a model output a Nx4 output. # since r is just Nx1, we spread the reward into the actions. - y_hat = self.action_model(s).gather(1, a) - y = self.discount * self.action_model(s_prime).max(axis=1)[0].unsqueeze(1) * masking + r.expand_as(y_hat) + y_hat = self.calc_y_hat(s, a) + y = self.calc_y(s_prime, masking, r, y_hat) loss = self.loss_func(y, y_hat) @@ -183,17 +195,14 @@ def optimize(self): def interpret_q(self, items): with torch.no_grad(): - r = torch.from_numpy(np.array([item.reward for item in items])).float() - s_prime = torch.from_numpy(np.array([item.result_state for item in items])).float() - s = torch.from_numpy(np.array([item.current_state for item in items])).float() - a = torch.from_numpy(np.array([item.actions for item in items])).long() - + s = torch.cat([item.s.float() for item in items]) + a = torch.cat([item.a.long() for item in items]) return self.action_model(s).gather(1, a) class FixedTargetDQN(DQN): - def __init__(self, data: MDPDataBunch, memory=None, tau=0.01, copy_over_frequency=3, **kwargs): - """Trains an Agent using the Q Learning method on a 2 neural nets. + def __init__(self, data: MDPDataBunch, memory=None, tau=0.01, copy_over_frequency=3, **kwargs): + r"""Trains an Agent using the Q Learning method on a 2 neural nets. Notes: Unlike the base DQN, this is a true reflection of ref [1]. We use 2 models instead of one to allow for @@ -213,60 +222,26 @@ def __init__(self, data: MDPDataBunch, memory=None, tau=0.01, copy_over_frequenc self.learner_callbacks += [partial(FixedTargetDQNCallback, copy_over_frequency=copy_over_frequency)] def target_copy_over(self): - """ Updates the target network from calls in the FixedTargetDQNCallback callback.""" + r""" Updates the target network from calls in the FixedTargetDQNCallback callback.""" # self.target_net.load_state_dict(self.action_model.state_dict()) for target_param, local_param in zip(self.target_net.parameters(), self.action_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data) - def optimize(self): - """Uses ER to optimize the Q-net. - + def calc_y(self, s_prime, masking, r, y_hat): + r""" Uses the equation: .. math:: Q^{*}(s, a) = \mathbb{E}_{s'∼ \Big\epsilon} \Big[r + \lambda \displaystyle\max_{a'}(Q^{*}(s' , a')) \;|\; s, a \Big] - - Returns (dict): Optimization information """ - if len(self.memory) > self.batch_size: - # Perhaps have memory as another item list? Should investigate. 
- sampled = self.memory.sample(self.batch_size) - - with torch.no_grad(): - r = torch.from_numpy(np.array([item.reward for item in sampled])).float() - s_prime = torch.from_numpy(np.array([item.result_state for item in sampled])).float() - s = torch.from_numpy(np.array([item.current_state for item in sampled])).float() - a = torch.from_numpy(np.array([item.actions for item in sampled])).long() - d = torch.from_numpy(np.array([item.done for item in sampled])).float() - - # Traditional `maze-random-5x5-v0` with have a model output a Nx4 output. - # since r is just Nx1, we spread the reward into the actions. - y_hat = self.action_model(s).gather(1, a) - - masking = torch.sub(1.0, d).unsqueeze(1) - y = self.discount * self.target_net(s_prime).max(axis=1)[0].unsqueeze(1) * masking + r.expand_as(y_hat) - - loss = self.loss_func(y, y_hat) - self.loss = loss.cpu().detach() - - if self.training: - self.opt.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_norm_(self.action_model.parameters(), self.gradient_clipping_norm) - for param in self.action_model.parameters(): - param.grad.data.clamp_(-1, 1) - self.opt.step() - - with torch.no_grad(): - post_info = {'td_error': (y - y_hat).numpy()} - return post_info + return self.discount * self.target_net(s_prime).max(axis=1)[0].unsqueeze(1) * masking + r.expand_as(y_hat) class DoubleDQN(FixedTargetDQN): def __init__(self, data: MDPDataBunch, memory=None, copy_over_frequency=3, **kwargs): - """ + r""" Double DQN training. References: @@ -279,57 +254,20 @@ def __init__(self, data: MDPDataBunch, memory=None, copy_over_frequency=3, **kwa super().__init__(data=data, memory=memory, copy_over_frequency=copy_over_frequency, **kwargs) self.name = 'DDQN' - def optimize(self): - """Uses ER to optimize the Q-net. - - Uses the equation: - - .. math:: - Q^{*}(s, a) = \mathbb{E}_{s'∼ \Big\epsilon} \Big[r + \lambda \displaystyle\max_{}(Q^{*}(s' , \ - argmax_{a'}(Q(s', \Theta)), \Theta^{-})) \;|\; s, a \Big] - - Returns (dict): Optimization information - """ - if len(self.memory) > self.batch_size: - # Perhaps have memory as another itemlist? Should investigate. - sampled = self.memory.sample(self.batch_size) - with torch.no_grad(): - r = torch.from_numpy(np.array([item.reward for item in sampled])).float() - s_prime = torch.from_numpy(np.array([item.result_state for item in sampled])).float() - s = torch.from_numpy(np.array([item.current_state for item in sampled])).float() - a = torch.from_numpy(np.array([item.actions for item in sampled])).long() - - # Traditional `maze-random-5x5-v0` with have a model output a Nx4 output. - # since r is just Nx1, we spread the reward into the actions. 
- y_hat = self.action_model(s).gather(1, a) - y = self.discount * self.target_net(s_prime).gather(1, self.action_model(s_prime).argmax(axis=1).unsqueeze( - 1)) + r.expand_as(y_hat) - - loss = self.loss_func(y, y_hat) - self.loss = loss - - if self.training: - self.opt.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_norm_(self.action_model.parameters(), self.gradient_clipping_norm) - for param in self.action_model.parameters(): - param.grad.data.clamp_(-1, 1) - self.opt.step() - - with torch.no_grad(): - post_info = {'td_error': (y - y_hat).numpy()} - return post_info + def calc_y(self, s_prime, masking, r, y_hat): + return self.discount * self.target_net(s_prime).gather(1, self.action_model(s_prime).argmax(axis=1).unsqueeze( + 1)) * masking + r.expand_as(y_hat) class DuelingDQNModule(nn.Module): - def __init__(self, a_s, stream_input_size): + def __init__(self, action, stream_input_size): super().__init__() - self.val = create_nn_model([stream_input_size], (0, 1), (stream_input_size, 0)) - self.adv = create_nn_model([stream_input_size], a_s[0], (stream_input_size, 0)) + self.val = nn.Linear(stream_input_size, 1) + self.adv = nn.Linear(stream_input_size, action.n_possible_values) def forward(self, x): - """Splits the base neural net output into 2 streams to evaluate the advantage and values of the state space and + r"""Splits the base neural net output into 2 streams to evaluate the advantage and values of the s space and corresponding actions. .. math:: @@ -341,17 +279,14 @@ def forward(self, x): Returns: """ - val = self.val(x) - adv = self.adv(x) - + val, adv = self.val(x), self.adv(x) x = val.expand_as(adv) + (adv - adv.mean()).squeeze(0) return x class DuelingDQN(FixedTargetDQN): def __init__(self, data: MDPDataBunch, memory=None, **kwargs): - """Replaces the basic action model with a DuelingDQNModule which splits the basic model into 2 streams. - + r"""Replaces the basic action model with a DuelingDQNModule which splits the basic model into 2 streams. References: [1] Wang, Ziyu, et al. "Dueling network architectures for deep reinforcement learning." @@ -364,16 +299,14 @@ def __init__(self, data: MDPDataBunch, memory=None, **kwargs): self.name = 'Dueling DQN' def initialize_action_model(self, layers, data): - base = create_nn_model(layers, *data.get_action_state_size())[:-1] - a_s = data.get_action_state_size() - stream_input_size = base[-2].out_features - dueling_head = DuelingDQNModule(a_s=a_s, stream_input_size=stream_input_size) + base = super().initialize_action_model(layers, data)[:-2] + dueling_head = DuelingDQNModule(action=data.action, stream_input_size=base[-1].out_features) return nn.Sequential(base, dueling_head) class DoubleDuelingDQN(DoubleDQN, DuelingDQN): def __init__(self, data: MDPDataBunch, memory=None, **kwargs): - """ + r""" Combines both Dueling DQN and DDQN. Args: diff --git a/fast_rl/core/Envs.py b/fast_rl/core/Envs.py index 3807856..79fbe07 100644 --- a/fast_rl/core/Envs.py +++ b/fast_rl/core/Envs.py @@ -22,15 +22,16 @@ def ban(envs: list): banned_envs = { 'Defender': 'Defender (%s) seems to load for an extremely long time. Skipping for now. Determine cause.', 'Fetch': 'Fetch (%s) envs are not ready yet.', + 'Blackjack-v0': 'Blackjack (%s) does not have a render function... 
Skipping', 'InvertedPendulumMuJoCoEnv': 'Mujoco Inverted Pendulum (%s) has a bug.', - 'HopperMuJoCoEnv': '(%s) Does not pass azure pipeline tests', - 'InvertedDoublePendulumMuJoCoEnv': '(%s) Does not pass azure pipeline tests', - 'HalfCheetahMuJoCoEnv': '(%s) Does not pass azure pipeline tests', - 'HumanoidMuJoCoEnv': '(%s) Does not pass azure pipeline tests', - 'Walker2DMuJoCoEnv': '(%s) Does not pass azure pipeline tests', - 'AntMuJoCoEnv': '(%s) Does not pass azure pipeline tests', - 'AtlasPyBulletEnv': 'AtlasPyBulletEnv (%s) seems to load very slowly. Skipping for now.', - 'MazeEnv': '(%s) Having a maze view issue.', + # 'HopperMuJoCoEnv': '(%s) Does not pass azure pipeline tests', + # 'InvertedDoublePendulumMuJoCoEnv': '(%s) Does not pass azure pipeline tests', + # 'HalfCheetahMuJoCoEnv': '(%s) Does not pass azure pipeline tests', + # 'HumanoidMuJoCoEnv': '(%s) Does not pass azure pipeline tests', + # 'Walker2DMuJoCoEnv': '(%s) Does not pass azure pipeline tests', + # 'AntMuJoCoEnv': '(%s) Does not pass azure pipeline tests', + # 'AtlasPyBulletEnv': 'AtlasPyBulletEnv (%s) seems to load very slowly. Skipping for now.', + # 'MazeEnv': '(%s) Having a maze view issue.', } envs = np.array(envs)[list(map(partial(Envs._error_out, ban_list=banned_envs), envs))] @@ -47,4 +48,4 @@ def get_all_envs(key=None, exclude_key=None): def get_all_latest_envs(key=None, exclude_key=None): all_envs = Envs.get_all_envs(key, exclude_key) roots = list(set(map(lambda x: str(x).split('-v')[0], all_envs))) - return list(set([sorted([env for env in all_envs if env.__contains__(root)])[-1] for root in roots])) + return list(set([sorted([env for env in all_envs if env.split('-v')[0] == root])[-1] for root in roots])) diff --git a/fast_rl/core/Interpreter.py b/fast_rl/core/Interpreter.py index 327716c..59e7866 100644 --- a/fast_rl/core/Interpreter.py +++ b/fast_rl/core/Interpreter.py @@ -17,7 +17,7 @@ from torch import nn from fast_rl.core import Learner -from fast_rl.core.MarkovDecisionProcess import MarkovDecisionProcessSlice, FEED_TYPE_IMAGE +from fast_rl.core.MarkovDecisionProcess import MarkovDecisionProcessSliceAlpha, FEED_TYPE_IMAGE class AgentInterpretationAlpha(Interpretation): @@ -55,11 +55,11 @@ def normalize(self, item: np.array): def top_losses(self, k: int = None, largest=True): raise NotImplementedError - def reward_heatmap(self, episode_slices: List[MarkovDecisionProcessSlice], action=None): + def reward_heatmap(self, episode_slices: List[MarkovDecisionProcessSliceAlpha], action=None): """ Takes a state_space and uses the agent to heat map rewards over the space. - We first need to determine if the state space is discrete or continuous. + We first need to determine if the s space is discrete or discrete. Args: state_space: @@ -86,7 +86,7 @@ def reward_heatmap(self, episode_slices: List[MarkovDecisionProcessSlice], actio def plot_heatmapped_episode(self, episode, fig_size=(13, 5), action_index=None, return_heat_maps=False): """ - Generates plots of heatmapped state spaces for analyzing reward distribution. + Generates plots of heatmapped s spaces for analyzing reward distribution. Currently only makes sense for grid based envs. Will be expecting gym_maze environments that are discrete. 
@@ -94,9 +94,9 @@ def plot_heatmapped_episode(self, episode, fig_size=(13, 5), action_index=None, """ if not str(self.ds.env.spec).__contains__('maze'): - raise NotImplementedError('Currently only supports gym_maze envs that have discrete state spaces') + raise NotImplementedError('Currently only supports gym_maze envs that have discrete s spaces') if not isinstance(self.ds.state_size, Box): - raise NotImplementedError('Currently only supports Box based state spaces with 2 dimensions') + raise NotImplementedError('Currently only supports Box based s spaces with 2 dimensions') items = self._get_items() heat_maps = [] @@ -141,7 +141,7 @@ def plot_heatmapped_episode(self, episode, fig_size=(13, 5), action_index=None, if return_heat_maps: return heat_maps def plot_episode(self, episode): - items = self._get_items(False) # type: List[MarkovDecisionProcessSlice] + items = self._get_items(False) # type: List[MarkovDecisionProcessSliceAlpha] episode_counter = 0 # For each episode @@ -213,7 +213,7 @@ def plot_agent_accuracy_density(self, episode_num=None): Returns: """ - items = self._get_items(False) # type: List[MarkovDecisionProcessSlice] + items = self._get_items(False) # type: List[MarkovDecisionProcessSliceAlpha] x, y = self.get_agent_accuracy_density(items, episode_num) fig = plt.figure(figsize=(8, 8)) @@ -250,7 +250,7 @@ def plot_q_density(self, episode_num=None): Returns: """ - items = self._get_items(False) # type: List[MarkovDecisionProcessSlice] + items = self._get_items(False) # type: List[MarkovDecisionProcessSliceAlpha] x, y = self.get_q_density(items, episode_num) # Define the borders diff --git a/fast_rl/core/Learner.py b/fast_rl/core/Learner.py deleted file mode 100644 index 9d58097..0000000 --- a/fast_rl/core/Learner.py +++ /dev/null @@ -1,37 +0,0 @@ -from functools import partial - -from fastai.basic_train import Recorder, defaults, Learner, listify, ifnone - -from fast_rl.agents.BaseAgent import BaseAgent -from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, MDPMemoryManager -from fast_rl.core.agent_core import fit -from fast_rl.core.metrics import EpsilonMetric - - -class AgentLearner(object): - silent: bool = None - - def __init__(self, data: MDPDataBunch, model: BaseAgent, mem_strategy='k_partitions_both', k=1, - metrics=None): - """ - Will very soon subclass the fastai learner class. For now we need to understand the important functional - requirements needed in a learner. 
- - """ - self.model = model - self.data = data - self.metrics = [] - self.opt = self.model.opt - if self.silent is None: self.silent = defaults.silent - self.add_time: bool = True - self.recorder = Recorder(learn=self, add_time=self.add_time, silent=self.silent) - self.recorder.no_val = self.data.empty_val - self.callbacks = ifnone(metrics, []) - self.callbacks += [partial(MDPMemoryManager, mem_strategy=mem_strategy, k=k)] - self.callbacks = [self.recorder] + [f(self) for f in self.callbacks] + [f(learn=self) for f in self.model.learner_callbacks] - - def predict(self, element): - return self.model.pick_action(element) - - def fit(self, epochs): - fit(epochs, self, self.callbacks, metrics=self.metrics) diff --git a/fast_rl/core/MarkovDecisionProcess.py b/fast_rl/core/MarkovDecisionProcess.py index 300aa58..3d7ce4f 100644 --- a/fast_rl/core/MarkovDecisionProcess.py +++ b/fast_rl/core/MarkovDecisionProcess.py @@ -1,8 +1,8 @@ -from numbers import Integral - import gym -from fastai.basic_train import LearnerCallback -from gym.spaces import Discrete, Box +from fastai.basic_train import LearnerCallback, DatasetType +from gym.spaces import Discrete, Box, MultiDiscrete + +from fast_rl.util.exceptions import MaxEpisodeStepsMissingError try: # noinspection PyUnresolvedReferences @@ -17,231 +17,393 @@ print(f'Can\'t import one of these: {e}') from fastai.core import * -# noinspection PyProtectedMember -from fastai.data_block import ItemList, Tensor, Dataset, DataBunch, data_collate, DataLoader, PreProcessors +from fastai.data_block import ItemList, Tensor, Dataset, DataBunch, DataLoader from fastai.imports import torch -from fastai.vision import Image -from gym import error -from gym.envs.algorithmic.algorithmic_env import AlgorithmicEnv -from gym.envs.toy_text import discrete -from gym.wrappers import TimeLimit from datetime import datetime import pickle -from fast_rl.util.misc import b_colors +from fast_rl.util.misc import b_colors, list_in_str FEED_TYPE_IMAGE = 0 FEED_TYPE_STATE = 1 -class MDPMemoryManager(LearnerCallback): - def __init__(self, learn, mem_strategy, k, max_episodes=None, episode=0, iteration=0): +@dataclass +class Bounds(object): + r""" + Handles bounds for 1 dimensional spaces. + + between (gym.Space): A convenience variable for determining the min and max bounds + + discrete (bool): Whether the Bounds are discrete or not. Important for N possible values calc. + + min (list): Correlated min for a given dimension + + max (list): Correlated max for a given dimension + """ + between: Any = None + discrete: bool = False + min: list = None + max: list = None + + def __len__(self): + r""" + Returns the number of dimensions the bounds have. + + `self.min`'s length is returned because `self.max` and `self.min` as validated to have the same length. + """ + return len(self.min) + + @property + def n_possible_values(self): """ - Handles different ways of memory management: - - (k_partitions_best) keep partition best episodes - - (k_partitions_worst) keep partition worst episodes - - (k_partitions_both) keep partition worst best episodes - - (k_top_best) keep high fidelity k top episodes - - (k_top_worst) keep k top worst - - (k_top_both) keep k top worst and best - - (none) keep none, only load into memory (always keep first) - - (all): keep all steps will be kept (most memory inefficient) + Returns the maximum number of values that can be taken. + + This is important for doing embeddings. 
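+
+        For example, ``Bounds(between=gym.spaces.Discrete(4))`` has ``min=[0]`` and ``max=[4]``, so
+        ``n_possible_values`` is 4, while non-discrete bounds (e.g. a float Box) return ``np.inf``.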
+ """ + if not self.discrete: return np.inf + else: return np.prod(np.subtract(self.max, self.min)) + + def __post_init__(self): + """Sets min and max fields then validates them.""" + self.min, self.max = ifnone(self.min, []), ifnone(self.max, []) + # If a tuple has been passed, break it into correlated min max variables. + if self.between is not None: + for b in (self.between if isinstance(self.between, gym.spaces.Tuple) else listify(self.between)): + if isinstance(b, (int, np.int64, np.int, float, np.float)): + self.min, self.max = self.min + [0], self.max + [b] + self.discrete = isinstance(b, (int, np.int, np.int64)) + elif isinstance(b, Discrete): + self.min, self.max, self.discrete = self.min + [0], self.max + [b.n], True + elif isinstance(b, MultiDiscrete): + self.min, self.max, self.discrete = self.min + [0] * sum(b.nvec.shape), self.max + b.nvec, True + elif isinstance(b, Box): + self.min, self.max = self.min + list(b.low), self.max + list(b.high) + self.discrete = any([b.dtype in (int, np.int, np.int64)]) + else: + raise ValueError(f'Tuple not understood {self.between}') + + if len(self.min) != len(self.max): + raise ValueError(f'Min Max do not match min {len(self.min)} max {len(self.max)}') + if len(self.min) == 0: raise ValueError(f'Min and Max are 0') + + +@dataclass +class Action(object): + r""" + Handles actions, action space, and value verification. + + An important difference between taken_action and raw_action is that the raw action is the immediate + raw output from the model before any argmax processing. + + taken_action (np.array): Expected to always to be numpy arrays with shape (-1, 1). This is the action that + is to be input into the env `step` function. + + raw_action (np.array): Expected to always to be numpy arrays with shape (-1, 1). This is the raw model + output such as neural net final layer output. Can be None. + + action_space (gym.Space): Used for estimating the max number of values. This is important for embeddings. + + bounds (tuple): Maximum and minimum values for each action dimension. + + n_possible_values (int): An integer or inf value indicating the total number of possible actions there are. + """ + taken_action: torch.tensor + action_space: gym.Space + raw_action: torch.tensor = None + bounds: Bounds = None + n_possible_values: int = 0 + + def __post_init__(self): + # Determine bounds + self.bounds = Bounds(self.action_space) + self.n_possible_values = self.bounds.n_possible_values + + # Fix shapes + self.taken_action = torch.tensor(data=self.taken_action).reshape(1, -1) + if self.raw_action is not None: self.raw_action = torch.tensor(data=self.raw_action).reshape(1, -1) + + def get_single_action(self): + """ OpenAI envs do not like 1x1 arrays when they are expecting scalars, so we need to unwrap them. 
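+        For a single discrete dimension this returns a plain ``int``, for a single continuous dimension a
+        one-element ``float`` list, and otherwise a numpy array cast to the matching dtype.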
""" + a = self.taken_action.detach().numpy()[0] + if len(self.bounds) == 1: a = a[0] + if self.bounds.discrete and len(self.bounds) == 1: return int(a) + elif self.bounds.discrete and len(self.bounds) != 1: return a.astype(int) + elif not self.bounds.discrete and len(self.bounds) == 1: return [float(a)] + elif not self.bounds.discrete and len(self.bounds) != 1: return a.reshape(-1,).astype(float) + raise ValueError(f'This should not have crashed.') + + def set_single_action(self, action: np.array): + if np.isscalar(action): + self.taken_action = torch.tensor(data=action).reshape(1, -1) + elif len(action.shape) == 1: + self.taken_action = torch.tensor(data=action).reshape(1, -1) + elif len(action.shape) == 3 and action.shape[0] == 1: + self.taken_action = torch.tensor(data=action)[0] + self.taken_action = self.taken_action.int() if self.bounds.discrete else self.taken_action.float() + + +@dataclass +class State(object): + r""" + Handles states, both their main and alternate formats. + + s (np.array): State space acquired from `env.reset` `s_prime` or `env.render`. + + s_prime (np.array): State space acquired from `env.step` or `env.render`. + + alt_s (np.array): Alternate State space acquired from `env.reset` `s_prime` or `env.render`. Should be an image. + + alt_s_prime (np.array): Alternate State space acquired from `env.step` or `env.render`. Should be an image. + + mode (int): Should be either FEED_TYPE_IMAGE or FEED_TYPE_STATE + + observation_space (gym.Space): Used for estimating the max number of values. This is important for embeddings. + + bounds (Bounds): Maximum and minimum values for each state dimension. + + n_possible_values (int): An integer or inf value indicating the total number of possible actions there are. + """ + s: torch.tensor + s_prime: torch.tensor + alt_s: Union[torch.tensor, np.array] + alt_s_prime: Union[torch.tensor, np.array] + observation_space: gym.Space + mode: int = FEED_TYPE_STATE + bounds: Bounds = None + n_possible_values: int = 0 + + def __str__(self): + out = copy(self.__dict__) + for key in out: + if out[key] is not None and (key == 's' or list_in_str(key, ['_s', 's_'])): out[key] = out[key].shape + + return f'State: ' + ', '.join([str(i) for i in out.items()]) + + def _fix_field(self, input_field): + if input_field is None: return None + input_field = copy(input_field) + + if type(input_field) is str: input_field = np.array(input_field) + elif type(input_field) is tuple: + dtype = int if self.bounds.discrete else float + input_field = torch.tensor(data=np.array(input_field).reshape(1, -1).astype(dtype)) + elif np.isscalar(input_field): input_field = torch.tensor(data=input_field) + elif type(input_field) is torch.Tensor: input_field = input_field.clone().detach() + else: input_field = torch.from_numpy(input_field) + + # If a non-image state missing the batch dim + if len(input_field.shape) <= 1: return input_field.reshape(1, -1) + # If a non-image 2+d state missing the batch dim + elif input_field.shape[0] != 1 and len(input_field.shape) != 3: return input_field.reshape(1, -1) + # If an image state and missing the batch dim + elif input_field.shape[0] != 1 and len(input_field.shape) == 3: return input_field.unsqueeze(0) + # If an image with 4 dims (b, w, h, c), return safely + elif input_field.shape[0] == 1 and len(input_field.shape) == 4: return input_field + # If not an image, but has 2 dims (b, d), return safely + elif input_field.shape[0] == 1 and len(input_field.shape) == 2: return input_field + raise ValueError(f'Input has shape {input_field} 
for mode {self.mode}. This is unexpected.') + + def __post_init__(self): + if self.mode not in [FEED_TYPE_IMAGE, FEED_TYPE_STATE]: + ValueError(f'Mode invalid {self.mode} not valid feed type') + # We want to swap the state variables if the alt is the image state + if self.mode == FEED_TYPE_IMAGE and len(self.alt_s.shape) > 2 and len(self.s.shape) != 3: + self.observation_space = gym.spaces.Box(0, 255, self.alt_s.shape, dtype=np.int64) + self.alt_s, self.alt_s_prime, self.s, self.s_prime = self.s, self.s_prime, self.alt_s, self.alt_s_prime + # Determine bounds + self.bounds = Bounds(self.observation_space) + self.n_possible_values = self.bounds.n_possible_values + # Fix Shapes + self.s, self.s_prime = self._fix_field(self.s), self._fix_field(self.s_prime) + self.alt_s, self.alt_s_prime = self._fix_field(self.alt_s), self._fix_field(self.alt_s_prime) + + +@dataclass +class MDPStep(object): + r""" + Contains all the variables to represent a Markov Decision Process step. + + action (Action): + + state (State): + + done (bool): + + reward (float): + + episode (int): + + step (int): + + """ + action: Action + state: State + done: torch.tensor + reward: torch.tensor + episode: int + step: int + + def __post_init__(self): + self.action = deepcopy(self.action) + self.state = deepcopy(self.state) + self.reward = torch.tensor(data=self.reward).reshape(1, -1).float() + self.done = torch.tensor(data=self.done).reshape(1, -1).float() + + def __str__(self): return ', '.join([str(self.__dict__[el]) for el in self.__dict__]) + + def clean(self): + r""" Removes fields that are generally unimportant (purely debugging) """ + self.state.alt_s_prime = None + self.state.alt_s = None + self.state.observation_space = None + self.state.bounds = None + self.state.n_possible_values = None + self.action.raw_action = None + self.action.action_space = None + self.action.bounds = None + self.action.n_possible_values = None + + @property + def data(self): return self.state.s_prime[0], self.state.alt_s_prime[0] + @property + def obj(self): return self.__dict__ + @property + def s(self): return self.state.s + @property + def s_prime(self): return self.state.s_prime + @property + def alt_s_prime(self): return self.state.alt_s_prime + @property + def a(self): return self.action.taken_action + @property + def d(self): + return bool(self.done) + + +class MDPCallback(LearnerCallback): + def __init__(self, learn): + """ + Handles action assignment, episode naming. Args: learn: - mem_strategy: - k: - max_episodes: - episode: - iteration: """ super().__init__(learn) - self.mem_strategy = mem_strategy + self.train_ds: MDPDataset = learn.data.train_ds + self.valid_ds: MDPDataset = None if learn.data.empty_val else learn.data.valid_ds + + def on_batch_begin(self, last_input, last_target, train, **kwargs: Any): + """ Set the Action of a dataset, determine if still warming up. 
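+        Also returns ``{'skip_bwd': True}`` so fastai's usual backward pass is skipped; the agent performs its
+        own optimization from its model callbacks instead.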
""" + a = self.learn.predict(last_input) + if train: self.train_ds.action = Action(taken_action=a, action_space=self.train_ds.action.action_space) + else: self.valid_ds.action = Action(taken_action=a, action_space=self.train_ds.action.action_space) + self.train_ds.is_warming_up = self.learn.model.warming_up + if self.valid_ds is not None: self.valid_ds.is_warming_up = self.learn.model.warming_up + if not self.learn.model.warming_up and self.learn.loss_func is None: + self.learn.init_loss_func() + return {'skip_bwd': True} + + def on_backward_end(self, **kwargs: Any): + return {'skip_step': True} + + def on_step_end(self, **kwargs: Any): + return {'skip_zero': True} + + def on_epoch_end(self, last_metrics, epoch, **kwargs: Any) -> None: + """ Updates the most recent episode number in both datasets. """ + self.train_ds.x.set_recent_run_episode(epoch) + self.train_ds.episode = epoch + if last_metrics[0] is not None: + self.valid_ds.x.set_recent_run_episode(epoch) + self.valid_ds.episode = epoch + + +class MDPMemoryManager(LearnerCallback): + def __init__(self, learn, strategy, k=1): + super().__init__(learn) + self.strategy = strategy self.k = k - self.ds = None - self.max_episodes = max_episodes - self.episode = episode - self.last_episode = episode - self.iteration = iteration - self._persist = max_episodes is not None - - # Private Util Fields - self._train_episodes_keep = {} - self._valid_episodes_keep = {} - self._current_partition = 0 - - self._lower = 0 - self._upper = 0 - - def _comp_less(self, key, d, episode): return d[key] < self.data.x.info[episode] - def _comp_greater(self, key, d, episode): return d[key] > self.data.x.info[episode] - def _dict_access(self, key, dictionary): return {key: dictionary[key]} - - def _k_top_best(self, episode, episodes_keep): - best = dict(filter(partial(self._comp_greater, d=episodes_keep, episode=episode), episodes_keep)) - return list(dict(sorted(best.items(), reverse=True)).keys()) - - def _k_top_worst(self, episode, episodes_keep): - worst = dict(filter(partial(self._comp_less, d=episodes_keep, episode=episode), episodes_keep)) - return list(dict(sorted(worst.items())).keys()) - - def _k_top_both(self, episode, episodes_keep): - best, worst = self._k_top_best(episode, episodes_keep), self._k_top_worst(episode, episodes_keep) - if best: return best - return worst - - def _k_partitions_best(self, episode, episodes_keep): - # Filter episodes by their partition - if episode >= self._upper: self.lower, self._upper = self._upper, self._upper + self.max_episodes // self.k - filtered_episodes = {ep: episodes_keep[ep] for ep in episodes_keep if self._lower <= ep < self._upper} - best = list(filter(partial(self._comp_greater, d=filtered_episodes, episode=episode), filtered_episodes)) - best = {k: filtered_episodes[k] for k in filtered_episodes if k in best} - return list(dict(sorted(best.items(), reverse=True)).keys()) - - def _k_partitions_worst(self, episode, episodes_keep): - # Filter episodes by their partition - if episode >= self._upper: self.lower, self._upper = self._upper, self._upper + self.max_episodes // self.k - filtered_episodes = {ep: episodes_keep[ep] for ep in episodes_keep if self._lower <= ep < self._upper} - worst = list(filter(partial(self._comp_less, d=filtered_episodes, episode=episode), filtered_episodes)) - worst = {k: filtered_episodes[k] for k in filtered_episodes if k in worst} - return list(dict(sorted(worst.items())).keys()) - - def _k_partitions_both(self, episode, episodes_keep): - best, worst = 
self._k_partitions_best(episode, episodes_keep), self._k_partitions_worst(episode, episodes_keep) - if best: return best - return worst - - def on_loss_begin(self, **kwargs: Any): - self.iteration += 1 - - def manage_memory(self, episodes_keep, episode): - if episode not in self.ds.x.info: return - episodes_keep[episode] = self.ds.x.info[episode] - if len(episodes_keep) < self.k: return - # We operate of the second to most recent episode so the agent still has an opportunity to use the full episode - episode = episode - 1 - if episode not in self.ds.x.info: return + self._strategy_fn_dict = { + 'k_partitions_top': self.k_top + } - try: - # If the episodes to keep is full, then we have to decide which ones to remove - if self.mem_strategy == 'k_top_best': del_ep = self._k_top_best(episode, episodes_keep) - if self.mem_strategy == 'k_top_worst': del_ep = self._k_top_worst(episode, episodes_keep) - if self.mem_strategy == 'k_top_both': del_ep = self._k_top_both(episode, episodes_keep) - if self.mem_strategy == 'k_partitions_best': del_ep = self._k_partitions_best(episode, episodes_keep) - if self.mem_strategy == 'k_partitions_worst': del_ep = self.k_partitions_worst(episode, episodes_keep) - if self.mem_strategy == 'k_partitions_both': del_ep = self._k_partitions_both(episode, episodes_keep) - if self.mem_strategy == 'all': del_ep = [-1] - - # If there are episodes to delete, then set them as the main episode to delete - if len(del_ep) != 0: - # episodes_keep[episode] = self.ds.x.info[episode] - del episodes_keep[del_ep[0]] - episode = del_ep[0] - self.ds.x.clean(episode) - self.ds.x.info = episodes_keep - - except KeyError as e: - pass - except TypeError as e: - pass - - def on_train_begin(self, n_epochs, **kwargs: Any): - self.max_episodes = n_epochs if not self._persist else self.max_episodes - self._upper = self.max_episodes // self.k - - def on_epoch_begin(self, epoch, **kwargs: Any): - self.episode = epoch if not self._persist else self.episode + 1 - self.iteration = 0 - - def on_epoch_end(self, **kwargs: Any) -> None: - if self.learn.data.train_ds is not None: - self.ds = self.learn.data.train_ds - self.manage_memory(self._train_episodes_keep, self.episode) - if self.learn.data.valid_dl is not None: - self.ds = self.learn.data.valid_ds - self.manage_memory(self._valid_episodes_keep, self.episode) + def _comp_less(self, key, info, episode): + return info[key] < info[episode] + def _comp_greater(self, key, info, episode): + return info[key] > info[episode] -class MDPDataset(Dataset): - def __init__(self, env: gym.Env, feed_type=FEED_TYPE_STATE, render='rgb_array', max_steps=None, bs=8, - x=None, memory_management_strategy='k_partitions_best', k=1, skip=False, embeddable=False): - """ - Handles the running and loading of environments, as well as storing episode steps. 
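# Illustrative sketch of the 'k_partitions_top' strategy implemented by `k_top` below (a
# simplified, editorial reference of the intended behaviour, not part of this patch):
# `info` maps episode -> [cumulative_reward, already_cleaned]; episode -1 holds placeholder
# data and is cleaned first, otherwise only the k highest-reward episodes are kept.
from typing import Dict, List

def sketch_k_top(info: Dict[int, List], k: int = 1):
    if -1 in info and not info[-1][1]: return [-1]      # placeholder episode is cleaned first
    alive = [ep for ep in info if not info[ep][1]]      # episodes still holding step data
    if len(alive) <= k: return None                     # not enough episodes to prune yet
    keep = sorted((ep for ep in info if ep != -1), key=lambda ep: info[ep][0], reverse=True)[:k]
    return [ep for ep in alive if ep not in keep]       # everything else is returned for cleaning

# Episode 1 has the highest total reward, so with k=1 episodes 0 and 2 are returned for cleaning.
assert sorted(sketch_k_top({0: [10.0, False], 1: [25.0, False], 2: [5.0, False]})) == [0, 2]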
+ def k_top(self, info: Dict[int, List[Tuple[float, bool]]]): + # If the episode -1 is defined, then clean it, this is just placeholder data + if -1 in info and not info[-1][1]: return [-1] + # If the number of not-clean episodes is less than k, don't do anything + if len([k for k in info if not info[k][1]]) <= self.k: return None + # Collect k top episodes, then clean the rest + k_top = [] + for episode in [i for i in info if i != -1]: + # If the episode is greater than all of the other episodes that are not already in k_top + compared = [info[episode][0] > info[k][0] for k in info if k not in k_top and k != -1 and k != episode] + if all(compared) and len(compared) != 0 and len(k_top) < self.k: + k_top.append(episode) + + return list(set([k for k in info if not info[k][1]]) - set(k_top)) - mem_strategy has a few settings. This is for inference on the existing data: - - (k_partitions_best) keep partition best episodes - - (k_partitions_both) keep partition worst best episodes - - (k_top_best) keep high fidelity k top episodes - - (k_top_worst) keep k top worst - - (k_top_both) keep k top worst and best - - (non) keep non, only load into memory (always keep first) - - (all): keep all steps will be kept (most memory inefficient) + + def on_epoch_end(self, **kwargs: Any): + for ds_type in [DatasetType.Train] if self.learn.data.empty_val else [DatasetType.Train, DatasetType.Valid]: + ds: MDPDataset = self.learn.dl(ds_type).dataset + episodes = self._strategy_fn_dict[self.strategy](ds.x.info) + if episodes is not None: + for e in episodes: ds.x.clean(e) + + +class MDPDataset(Dataset): + def __init__(self, env: gym.Env, memory_manager, bs, render='rgb_array', feed_type=FEED_TYPE_STATE, max_steps=None): + r""" + Handles env execution and ItemList building. Args: - env: - feed_type: - render: - max_steps: - bs: - x: - memory_management_strategy: Reference above. Tests `self` how to keep memory usage down. - k: Related to the `mem_strategy` and is either k quartiles and k most. - save_every: Skip adding to datasets. + env: OpenAI environment to execute. + memory_manager: Handles how the list size will be reduced sch as removing image data. + bs: Size of a single batch for models and the dataset to use. 
""" - self.k = k - self.skip = False - if skip: - self.skip = skip - return - self.mem_strat = memory_management_strategy - self.bs = bs - # noinspection PyUnresolvedReferences,PyProtectedMember - env._max_episode_steps = env.spec.max_episode_steps if not hasattr(env, '_max_episode_steps') else env._max_episode_steps - self.max_steps = env._max_episode_steps if max_steps is None else max_steps + self.env = env self.render = render self.feed_type = feed_type - self.env = env - # MDP specific values - self.actions = self.get_random_action(env.action_space) - if isinstance(env.action_space, Box): self.raw_action = np.random.randn((env.action_space.shape[0])) - elif isinstance(env.action_space, Discrete): self.raw_action = np.random.randn((env.action_space.n)) - else: self.raw_action = self.get_random_action(env.action_space) - - self.is_done = True - self.current_state = None - self.current_image = None - - self.embeddable = embeddable - - self.env_specific_handle() - self.counter = -1 - self.episode = 0 - self.x = MarkovDecisionProcessList() if x is None else x # self.new(0) - self.item = None - self.episodes_to_keep = {} + self.bs = bs + self._max_steps = max_steps + self.action = Action(taken_action=self.env.action_space.sample(), action_space=self.env.action_space) + self.state = None + self.s_prime, self.alt_s_prime = None, None + self.callback = [MDPCallback, memory_manager] + # Tracking fields + self.episode = -1 + self.counter = 0 + self.is_warming_up = True + + # FastAI fields + self.x = MDPList([]) + self.item: Union[MDPStep, None] = None + self.new(None) + + def aug_steps(self, steps): + if self.is_warming_up and steps < self.bs: return self.bs + return steps @property - def state_size(self): - if self.feed_type == FEED_TYPE_STATE: - return self.env.observation_space - else: - return gym.spaces.Box(0, 255, shape=self.env.render('rgb_array').shape) - - def __del__(self): - self.env.close() + def max_steps(self): + if self._max_steps is not None: return self._max_steps + if hasattr(self.env, '_max_episode_steps'): return getattr(self.env, '_max_episode_steps') + if self.env.spec.max_episode_steps is not None: return self.env.spec.max_episode_steps - def env_specific_handle(self): - if isinstance(self.env, TimeLimit) and isinstance(self.env.unwrapped, AlgorithmicEnv): - self.render = 'ansi' if self.render == 'rgb_array' else self.render - if isinstance(self.env, TimeLimit) and isinstance(self.env.unwrapped, discrete.DiscreteEnv): - self.render = 'ansi' if self.render == 'rgb_array' else self.render + msg = f'Env {self.env.spec.id} does not have max episode steps. ' + msg += ' Either pass the max steps as a param, or catch this exception then pass a default max step param.' + raise MaxEpisodeStepsMissingError(msg) - def get_random_action(self, action_space=None): - action_space = action_space if action_space is not None else self.env.action_space - return action_space.sample() - - def _get_image(self): - # Specifically for the stupid blackjack-v0 env >:( + @property + def image(self): + r""" Needed because of blackjack-v0 env >:( """ try: current_image = self.env.render('rgb_array') if self.render == 'human': self.env.render(self.render) @@ -250,66 +412,61 @@ def _get_image(self): current_image = None return current_image - def new(self, _): + def __del__(self): + self.env.close() + + def __len__(self): + return self.aug_steps(self.max_steps) + + def stage_1_env_reset(self): + r""" + Handles environment resetting and dataset batch termination. 
+ + We are interested in the entire dataset ending when an item is done. + + Returns: The state space and the image after a reset. + """ - New element is a query of the environment + if self.counter != 0 and self.item.d: + self.counter = 0 + if not self.is_warming_up: raise StopIteration + if self.item is None or self.item.d: return self.env.reset(), self.image + return self.s_prime, self.alt_s_prime - Args: - _: + def stage_2_env_step(self) -> Tuple[np.array, float, bool, None, np.array]: + r""" + Handles taking a step in the environment. - Returns: + We want to cancel the env early if we are at our max step amount. + Returns: The state, reward, whether the episode is done, and the image. """ - # First Phase: decide on episode reset. Collect current state and image representations. - if self.is_done or self.counter >= self.max_steps - 3: - self.current_state, reward, self.is_done, info = self.env.reset(), 0, False, {} - if type(self.current_state) is not list and type(self.current_state) is not np.ndarray: self.current_state = [self.current_state] - # Specifically for the stupid blackjack-v0 env >:( - self.current_image = self._get_image() - - result_state, reward, self.is_done, info = self.env.step(self.actions) - if self.is_done and self.counter == -1: - self.current_state, reward, self.is_done, info = self.env.reset(), 0, False, {} - - if type(result_state) is not list and type(result_state) is not np.ndarray: result_state = [result_state] - result_image = self._get_image() - self.counter += 1 + s_prime, reward, done, _ = self.env.step(self.action.get_single_action()) + # If we are at the max steps limit but the env is not done, we need to force an env end. + # However, we need the loop to iterate +1 time for allowing the stage_1_env_reset + if len(self) - 2 == self.counter: done = True + return s_prime, reward, done, _, self.image - # Second Phase: Generate MDP slice - result_state = result_image.transpose(2, 0, 1) if self.feed_type == FEED_TYPE_IMAGE and result_image is not None else result_state - current_state = self.current_image.transpose(2, 0, 1) if self.feed_type == FEED_TYPE_IMAGE and self.current_image is not None else self.current_state - alternate_state = result_state if self.feed_type == FEED_TYPE_IMAGE or result_state is None else result_image - items = MarkovDecisionProcessSlice(state=np.copy(current_state), state_prime=np.copy(result_state), - alt_state=np.copy(alternate_state), action=np.copy(self.actions), - reward=reward, done=copy(self.is_done), feed_type=copy(self.feed_type), - episode=copy(self.episode), raw_action=self.raw_action) - self.current_state = copy(result_state) - self.current_image = copy(result_image) + def new(self, _): + s, alt_s = self.stage_1_env_reset() + self.s_prime, reward, done, _, self.alt_s_prime = self.stage_2_env_step() + # If both the current item and the done are both true, then we need to retry the env + if self.item is not None and self.item.d and done: return self.new() - list_item = MarkovDecisionProcessList([items]) - return list_item + self.state = State(s, self.s_prime, alt_s, self.alt_s_prime, self.env.observation_space, self.feed_type) + self.item = MDPStep(self.action, self.state, done, reward, self.episode, self.counter) + self.counter += 1 - def __len__(self): - return self.max_steps + return MDPList([self.item]) - def __getitem__(self, _) -> 'MDPDataset': + def __getitem__(self, _): item = self.new(_) - if (self.x and self.is_done and self.counter != -1) or \ - (self.counter >= self.max_steps - 2): - self.x.add(item) - 
self.counter = -1 - self.episode += 1 - raise StopIteration - self.x.add(item) - # x = self.x[idxs] # Perhaps have this as an option? - x = self.x[-1] - return x.copy() if isinstance(x, np.ndarray) else x + return self.x[-1] def to_csv(self, root_path, name): - df = self.x.to_df() # type: pd.DateFrame() if not os.path.exists(root_path): os.makedirs(root_path) - df.to_csv(root_path / (name + '.csv'), index=False) + self.x.to_df().to_csv(root_path / (name + '.csv'), index=False) def to_pickle(self, root_path, name): if not os.path.exists(root_path): os.makedirs(root_path) @@ -318,149 +475,71 @@ def to_pickle(self, root_path, name): class MDPDataBunch(DataBunch): - def _get_sizes_and_possible_values(self, item): - if isinstance(item, Discrete) and len(item.shape) != 0: return item.n, item.n - if isinstance(item, Discrete) and len(item.shape) == 0: return 1, item.n - if isinstance(item, Box) and (item.dtype == int or item.dtype == np.uint8): - return item.shape if len(item.shape) > 1 else item.shape[0], np.prod(item.high) - if isinstance(item, Box) and item.dtype == np.float32: - return item.shape if len(item.shape) > 1 else item.shape[0], np.inf - - # noinspection PyUnresolvedReferences - def get_action_state_size(self): - if self.train_ds is not None: - a_s, s_s = self.train_ds.env.action_space, self.train_ds.state_size - elif self.valid_ds is not None: - a_s, s_s = self.valid_ds.env.action_space, self.valid_ds.state_size - else: - return None - return tuple(map(self._get_sizes_and_possible_values, [a_s, s_s])) - - @classmethod - def from_env(cls, env_name='CartPole-v1', max_steps=None, render='rgb_array', test_ds: Optional[Dataset] = None, - path: PathOrStr = None, bs: int = 1, feed_type=FEED_TYPE_STATE, val_bs: int = None, - num_workers: int = 0, embed=False, memory_management_strategy='k_partitions_both', - dl_tfms: Optional[Collection[Callable]] = None, device: torch.device = None, - collate_fn: Callable = data_collate, no_check: bool = False, add_valid=True, **dl_kwargs): - """ - - Args: - env_name: - max_steps: - render: - test_ds: - path: - bs: - feed_type: - val_bs: - num_workers: - embed: - memory_management_strategy: has a few settings. This is for inference on the existing data. 
- - (k_partitions_best) keep partition best episodes - - (k_partitions_both) keep partition worst best episodes - - (k_top_best) keep high fidelity k top episodes - - (k_top_worst) keep k top worst - - (k_top_both) keep k top worst and best - - (non) keep non, only load into memory (always keep first) - - (all): keep all steps will be kept (most memory inefficient) - dl_tfms: - device: - collate_fn: - no_check: - add_valid: - **dl_kwargs: - - Returns: + def __del__(self): + if self.train_dl is not None: del self.train_dl.train_ds + if self.valid_dl is not None: del self.valid_dl.valid_ds - """ + @property + def state_action_sample(self) -> Union[Tuple[State, Action], None]: + ds = ifnone(self.train_ds, self.valid_ds) # type: MDPDataset + return ds.state, ds.action if ds is not None else None - try: - # train_list = MDPDataset(gym.make(env_name), max_steps=max_steps, render=render) - # valid_list = MDPDataset(gym.make(env_name), max_steps=max_steps, render=render) - env = gym.make(env_name) - val_bs = bs if val_bs is None else val_bs - train_list = MDPDataset(env, max_steps=max_steps, render=render, bs=bs, embeddable=embed, - memory_management_strategy=memory_management_strategy) - if add_valid: valid_list = MDPDataset(env, max_steps=max_steps, render=render, bs=val_bs, embeddable=embed, - memory_management_strategy=memory_management_strategy) - else: valid_list = None - except error.DependencyNotInstalled as e: - print('Mujoco is not installed. Returning None') - if e.args[0].lower().__contains__('mujoco'): return None - - bs, val_bs = 1, None + @classmethod + def from_env(cls, env_name='CartPole-v1', max_steps=None, render='rgb_array', bs: int = 64, + feed_type=FEED_TYPE_STATE, num_workers: int = 0, memory_management_strategy='k_partitions_top', + split_env_init=True, device: torch.device = None, no_check: bool = False, + add_valid=True, **dl_kwargs): + + env = gym.make(env_name) + memory_manager = partial(MDPMemoryManager, strategy=memory_management_strategy) + train_list = MDPDataset(env, max_steps=max_steps, feed_type=feed_type, render=render, bs=bs, + memory_manager=memory_manager) + if add_valid: + valid_list = MDPDataset(env if split_env_init else gym.make(env_name), max_steps=max_steps, + render=render, bs=bs, feed_type=feed_type, memory_manager=memory_manager) + else: + valid_list = None path = './data/' + env_name.split('-v')[0].lower() + datetime.now().strftime('%Y%m%d%H%M%S') - - return cls.create(train_list, valid_list, num_workers=num_workers, test_ds=test_ds, path=path, bs=bs, - feed_type=feed_type, val_bs=val_bs, dl_tfms=dl_tfms, device=device, collate_fn=collate_fn, - no_check=no_check, **dl_kwargs) + return cls.create(train_list, valid_list, num_workers=num_workers, bs=1, device=device, **dl_kwargs) @classmethod - def from_pickle(cls, env_name='CartPole-v1', max_steps=None, render='rgb_array', test_ds: Optional[Dataset] = None, - path: PathOrStr = None, bs: int = 1, feed_type=FEED_TYPE_STATE, val_bs: int = None, - num_workers: int = 0, - dl_tfms: Optional[Collection[Callable]] = None, device: torch.device = None, - collate_fn: Callable = data_collate, no_check: bool = False, add_valid=True, **dl_kwargs): + def from_pickle(cls, env_name='CartPole-v1', bs: int = 1, feed_type=FEED_TYPE_STATE, render='rgb_array', + max_steps=None, add_valid=True, num_workers: int = defaults.cpus, path: PathOrStr = None, + device: torch.device = None, **dl_kwargs): if path is None: path = [_ for _ in os.listdir('./data/') if _.__contains__(env_name.split('-v')[0].lower())] if not path: raise 
IOError(f'There is no pickle dirs file found in ./data/ with the env name {env_name}') path = Path('./data/' + path[0]) - try: - env = gym.make(env_name) - val_bs = bs if val_bs is None else val_bs - train_ls = pickle.load(open(path / 'train.pickle', 'rb')) - train_list = MDPDataset(env, max_steps=max_steps, render=render, bs=bs, x=train_ls) - - if add_valid: - valid_ls = pickle.load(open(path / 'valid.pickle', 'rb')) - valid_list = MDPDataset(env, max_steps=max_steps, render=render, bs=val_bs, x=valid_ls) - else: - valid_list = None - - except error.DependencyNotInstalled as e: - print('Mujoco is not installed. Returning None') - if e.args[0].lower().__contains__('mujoco'): return None - - bs, val_bs = 1, None - if path is None: path = './data/' + env_name.split('-v')[0].lower() + datetime.now().strftime('%Y%m%d%H%M%S') + env = gym.make(env_name) + train_ls = pickle.load(open(path / 'train.pickle', 'rb')) + train_list = MDPDataset(env, max_steps=max_steps, render=render, bs=bs, x=train_ls) - return cls.create(train_list, valid_list, num_workers=num_workers, test_ds=test_ds, path=path, bs=bs, - feed_type=feed_type, val_bs=val_bs, dl_tfms=dl_tfms, device=device, collate_fn=collate_fn, - no_check=no_check, **dl_kwargs) + if add_valid: + valid_ls = pickle.load(open(path / 'valid.pickle', 'rb')) + valid_list = MDPDataset(env, max_steps=max_steps, render=render, bs=bs, x=valid_ls) + else: + valid_list = None - @classmethod - def from_csv(cls, env_name='CartPole-v1', max_steps=None, render='rgb_array', test_ds: Optional[Dataset] = None, - path: PathOrStr = None, bs: int = 1, feed_type=FEED_TYPE_STATE, val_bs: int = None, - num_workers: int = 0, - dl_tfms: Optional[Collection[Callable]] = None, device: torch.device = None, - collate_fn: Callable = data_collate, no_check: bool = False, add_valid=True, **dl_kwargs): - raise NotImplementedError('Not implemented for now. Saving state data into a csv seems extremely clunky.' - ' Suggested to use to_pickle and from_pickle due to easier numpy conversion.') + if path is None: path = './data/' + env_name.split('-v')[0].lower() + datetime.now().strftime('%Y%m%d%H%M%S') + + return cls.create(train_list, valid_list, num_workers=num_workers, path=path, bs=bs, feed_type=feed_type, + val_bs=1, device=device, **dl_kwargs) @classmethod - def create(cls, train_ds: MDPDataset, valid_ds: MDPDataset = None, - test_ds: Optional[Dataset] = None, path: PathOrStr = '.', bs: int = 1, - feed_type=FEED_TYPE_STATE, - val_bs: int = None, num_workers: int = defaults.cpus, dl_tfms: Optional[Collection[Callable]] = None, - device: torch.device = None, collate_fn: Callable = data_collate, no_check: bool = False, - **dl_kwargs) -> 'DataBunch': + def create(cls, train_ds: MDPDataset, valid_ds: MDPDataset = None, bs: int = 1, + num_workers: int = defaults.cpus, device: torch.device = None, **dl_kwargs) -> 'DataBunch': """Create a `DataBunch` from `train_ds`, `valid_ds` and maybe `test_ds` with a batch size of `bs`. Passes `**dl_kwargs` to `DataLoader()` Since this is a MarkovProcess, the batches need to be `bs=1` (for now...) 
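
        A minimal usage sketch (assumes a registered gym env such as CartPole-v0 and the
        default 'k_partitions_top' memory strategy):

            import gym
            from functools import partial
            mem = partial(MDPMemoryManager, strategy='k_partitions_top')
            train_ds = MDPDataset(gym.make('CartPole-v0'), memory_manager=mem, bs=64)
            data = MDPDataBunch.create(train_ds, num_workers=0)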
""" - train_ds.feed_type = feed_type - if valid_ds is not None: valid_ds.feed_type = feed_type - if test_ds is not None: test_ds.feed_type = feed_type - - datasets = cls._init_ds(train_ds, valid_ds, test_ds) - val_bs = ifnone(val_bs, bs) + datasets = cls._init_ds(train_ds, valid_ds, None) dls = [DataLoader(d, b, shuffle=s, drop_last=s, num_workers=num_workers, **dl_kwargs) for d, b, s in - zip(datasets, (bs, val_bs, val_bs, val_bs), (False, False, False, False)) if d is not None] - databunch = cls(*dls, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check) + zip(datasets, (bs, bs, bs, bs), (False, False, False, False)) if d is not None] + databunch = cls(*dls, **dl_kwargs) if valid_ds is None: databunch.valid_dl = None return databunch @@ -474,15 +553,13 @@ def to_pickle(self): @staticmethod def _init_ds(train_ds: Dataset, valid_ds: Dataset, test_ds: Optional[Dataset] = None): - # train_ds, but without training tfms - fix_ds = copy(train_ds) # valid_ds.new(train_ds.x) if hasattr(valid_ds, 'new') else train_ds - return [o for o in (train_ds, valid_ds, fix_ds, test_ds) if o is not None] + return [o for o in (train_ds, valid_ds, copy(train_ds), test_ds) if o is not None] -class MarkovDecisionProcessList(ItemList): +class MDPList(ItemList): _bunch = MDPDataBunch - def __init__(self, items=np.array([]), feed_type=FEED_TYPE_IMAGE, **kwargs): + def __init__(self, items: Iterator, **kwargs): """ Represents a MDP sequence over episodes. @@ -490,7 +567,7 @@ def __init__(self, items=np.array([]), feed_type=FEED_TYPE_IMAGE, **kwargs): Notes: Two important fields for you to be aware of: `items` and `x`. `x` is just the values being used for directly being feed into the model. - `items` contains an ndarray of MarkovDecisionProcessSlice instances. These contain the the primary values + `items` contains an ndarray of MarkovDecisionProcessSliceAlpha instances. These contain the the primary values in x, but also the other important properties of a MDP. 
Args: @@ -498,73 +575,32 @@ def __init__(self, items=np.array([]), feed_type=FEED_TYPE_IMAGE, **kwargs): feed_type: **kwargs: """ - super(MarkovDecisionProcessList, self).__init__(items, **kwargs) - self.feed_type = feed_type - self.copy_new.append('feed_type') - self.ignore_empty = True + super().__init__(items, **kwargs) self.info = {} + def set_recent_run_episode(self, episode): + for i, item in enumerate(reversed(self.items)): + if item.d and i != 0: break + item.episode = episode + self._update_info(episode, item) + def clean(self, episode): - for item in self.items: - if item.episode == episode: item.clean() + self.info[episode][1] = True + [item.clean() for item in self.items if item.episode == episode] - def add(self, items: 'ItemList'): - # Update the episode related composition information - for item in items.items: - if item.episode in self.info: self.info[item.episode] = float(np.sum(self.info[item.episode] + item.reward)) - else: self.info[item.episode] = float(item.reward) + def _update_info(self, ep, item: MDPStep): + self.info[ep] = float(np.sum(self.info[ep][0] + float(item.reward))) if ep in self.info else float(item.reward) + self.info[ep] = [self.info[ep], False] + def add(self, items: 'ItemList'): + [self._update_info(item.episode, item) for item in items.items] super().add(items) - def to_df(self): - return pd.DataFrame([i.obj for i in self.items]) + def to_df(self): return pd.DataFrame([i.obj for i in self.items]) - def to_dict(self): - return [i.obj for i in self.items] + def to_dict(self): return [i.obj for i in self.items] - def get(self, i): - return self.items[i].data + def get(self, i): return self.items[i].data def reconstruct(self, t: Tensor, x: Tensor = None): - if self.feed_type == FEED_TYPE_IMAGE: - return MarkovDecisionProcessSlice(state=Image(t), state_prime=Image(x[0]), - alt_state=Floats(x[1]), action=Floats(x[1]), - reward=Floats(x[2]), done=x[3], feed_type=self.feed_type) - else: - return MarkovDecisionProcessSlice(state=Floats(t), state_prime=Floats(x[0]), - alt_state=Image(x[1]), action=Floats(x[1]), - reward=Floats(x[2]), done=x[3], feed_type=self.feed_type) - - -class MarkovDecisionProcessSlice(ItemBase): - # noinspection PyMissingConstructor - def __init__(self, state, state_prime, alt_state, action, reward, done, episode, raw_action, - feed_type=FEED_TYPE_IMAGE): - action = np.copy(action) - raw_action = np.copy(raw_action) - if len(action.shape) == 0: action = np.array(action, ndmin=1) - if isinstance(np.copy(action), int): action = np.array(action, ndmin=1) - if isinstance(reward, float) or isinstance(reward, int): reward = np.array(reward, ndmin=1) - self.current_state, self.result_state, self.alternate_state, self.actions, self.reward, self.done, self.episode, self.raw_action = state, state_prime, alt_state, action, reward, done, episode, raw_action - self.data, self.obj = alt_state if feed_type == FEED_TYPE_IMAGE else state, \ - {'state': self.current_state, 'state_prime': self.result_state, - 'alt_state': self.alternate_state, 'action': action, 'reward': reward, 'done': done, - 'episode': episode, 'feed_type': feed_type, 'raw_action': raw_action} - - def clean(self, only_alt=False): - if not only_alt: - self.current_state, self.result_state = None, None - self.obj['state'], self.obj['state_prime'] = None, None - - self.alternate_state, self.obj['alt_state'] = None, None - - def __str__(self): - formatted = ( - map(lambda y: f'{y}:{self.obj[y].shape}', filter(lambda y: y.__contains__('state'), self.obj.keys())), - map(lambda y: 
f'{y}:{self.obj[y]}', filter(lambda y: not y.__contains__('state'), self.obj.keys())) - ) - - return ', '.join(list(formatted[0]) + list(formatted[1])) - - def to_one(self): - return Image(self.alternate_state) + raise NotImplementedError('Not sure when this will be important.') diff --git a/fast_rl/core/agent_core.py b/fast_rl/core/agent_core.py index 4836eb0..cd9ffe2 100644 --- a/fast_rl/core/agent_core.py +++ b/fast_rl/core/agent_core.py @@ -4,17 +4,12 @@ from collections import deque from functools import partial from math import ceil -from typing import List, Optional +from typing import List import gym import numpy as np import torch from fastai.basic_train import * -from fastai.basic_train import BasicLearner, CallbackList, OptMetrics, master_bar, is_listy, first_el, to_np -from fastai.callback import CallbackHandler -from fastprogress import progress_bar - -from fast_rl.core.MarkovDecisionProcess import MarkovDecisionProcessSlice from fast_rl.core.data_structures import SumTree @@ -58,7 +53,7 @@ def __init__(self, epsilon_start, epsilon_end, decay, start_episode=0, end_episo def perturb(self, action, action_space: gym.Space): """ - TODO for now does random discrete selection. Move to continuous soon. + TODO for now does random discrete selection. Move to discrete soon. Args: action: @@ -127,8 +122,8 @@ def refresh(self, **kwargs): class ExperienceReplay(Experience): def __init__(self, memory_size, **kwargs): - """ - Basic store-er of state space transitions for training agents. + r""" + Basic store-er of s space transitions for training agents. References: [1] Mnih, Volodymyr, et al. "Playing atari with deep reinforcement learning." @@ -139,7 +134,7 @@ def __init__(self, memory_size, **kwargs): """ super().__init__(memory_size, **kwargs) self.max_size = memory_size - self.memory = deque(maxlen=memory_size) # type: List[MarkovDecisionProcessSlice] + self.memory = deque(maxlen=memory_size) # type: List[MarkovDecisionProcessSliceAlpha] def __len__(self): return len(self.memory) @@ -149,7 +144,7 @@ def sample(self, batch, **kwargs): return random.sample(self.memory, batch) def update(self, item, **kwargs): - if self.reduce_ram: item.clean(True) + if self.reduce_ram: item.clean() self.memory.append(copy.deepcopy(item)) @@ -218,7 +213,7 @@ def update(self, item, **kwargs): """ maximal_priority = self.alpha - if self.reduce_ram: item.clean(True) + if self.reduce_ram: item.clean() self.memory.add(np.abs(maximal_priority) + self.epsilon, item) @@ -234,91 +229,3 @@ def __init__(self, memory_size): memory_size: """ super().__init__(memory_size) - - -def loss_batch(model, cb_handler: Optional[CallbackHandler]): - """ TODO will be different. Is there anything extra needed from here? """ - if model.out is not None: cb_handler.on_loss_begin(model.out) - if model.loss is None: return None - cb_handler.on_backward_begin(model.loss) - cb_handler.on_backward_end() - cb_handler.on_step_end() - return model.loss.detach().cpu() - - -def validate(learn, dl, cb_handler: Optional[CallbackHandler] = None, - pbar=None, average=True, n_batch: Optional[int] = None): - learn.model.eval() - with torch.no_grad(): - val_losses, nums = [], [] - if cb_handler: cb_handler.set_dl(dl) - # TODO 1st change: in fit function, original uses xb, yb. Maybe figure out what 2nd value to include? 
- for element in progress_bar(dl, parent=pbar): - learn.data.valid_ds.actions = learn.predict(element) - if cb_handler: element = cb_handler.on_batch_begin(element, learn.data.valid_ds.actions, train=False) - val_loss = loss_batch(learn.model, cb_handler=cb_handler) - if val_loss is None: continue - val_losses.append(val_loss) - r = dl.actions if is_listy(dl.actions) else np.array([dl.actions]) - nums.append(first_el(r).shape[0]) - if cb_handler and cb_handler.on_batch_end(val_losses[-1]): break - if n_batch and (len(nums) >= n_batch): break - nums = np.array(nums, dtype=np.float32) - if average and val_losses: return (to_np(torch.stack(val_losses)) * nums).sum() / nums.sum() - else: return val_losses - - -def fit(epochs: int, learn: BasicLearner, callbacks: Optional[CallbackList] = None, metrics: OptMetrics = None): - """ - Takes a RL Learner and trains it on a given environment. - - Important behavior notes: - - - The original Fastai fit function outputs the loss for every epoch. Since many RL models need to fill a memory buffer before optimization, the fit function for epoch 0 will run multiple episodes until the memory is filled. - - **So when you see the agent running through episodes without output, please note that it is most likely filling the memory first before properly printing.** - - Args: - epochs: - learn: - callbacks: - metrics: - - Returns: - - """ - assert len(learn.data.train_dl) != 0, f"""Your training dataloader is empty, can't train a model. Use a smaller - batch size (batch size={learn.data.train_dl.batch_size} for {len(learn.data.train_dl.dataset)} elements).""" - # Since CallbackHandler is a dataclass, these input fields will be automatically populated via - # a default __init__ - cb_handler = CallbackHandler(callbacks, metrics) - cb_handler.state_dict['skip_validate'] = learn.data.empty_val - pbar = master_bar(range(epochs)) - cb_handler.on_train_begin(epochs, pbar=pbar, metrics=metrics) - # Note that metrics in the on_train begin method need a name field. - exception = False - loss = None - try: - for epoch in pbar: - learn.model.train() - cb_handler.set_dl(learn.data.train_dl) - cb_handler.on_epoch_begin() - # TODO 1st change: While the loss is None, the model's memory has not filled and / or is not ready. - while loss is None: - # TODO 2nd change: in fit function, original uses xb, yb. Maybe figure out what 2nd value to include? - for element in progress_bar(learn.data.train_dl, parent=pbar): - # TODO 3rd change: get the action for the given state. Move to on batch begin callback? - learn.data.train_ds.actions = learn.predict(element) - cb_handler.on_batch_begin(element, learn.data.train_ds.actions) - # TODO 4th change: loss_batch is way simpler... What is a batch to be defined as? 
- loss = loss_batch(learn.model, cb_handler) - if cb_handler.on_batch_end(loss): break - - loss = None - if not cb_handler.skip_validate and not learn.data.empty_val: - val_loss = validate(learn, learn.data.valid_dl, cb_handler=cb_handler, pbar=pbar) - else: val_loss = None - if cb_handler.on_epoch_end(val_loss): break - except Exception as e: - exception = e - raise - finally: cb_handler.on_train_end(exception) diff --git a/fast_rl/core/basic_train.py b/fast_rl/core/basic_train.py new file mode 100644 index 0000000..1a43d8c --- /dev/null +++ b/fast_rl/core/basic_train.py @@ -0,0 +1,31 @@ +from fastai.basic_train import Learner + + +class WrapperLossFunc(object): + def __init__(self, learn): + self.learn = learn + + def __call__(self, *args, **kwargs): + return self.learn.model.loss + + +class AgentLearner(Learner): + + def __post_init__(self) -> None: + super().__post_init__() + self._loss_func = WrapperLossFunc(self) + self.loss_func = None + self.callback_fns += self.model.learner_callbacks + self.data.train_ds.callback + + def predict(self, element, **kwargs): + return self.model.pick_action(element) + + def init_loss_func(self): + r""" + Initializes the loss function wrapper for logging loss. + + Since most RL models have a period of warming up such as filling memory buffers, we cannot log any loss. + By default, the learner will have a `None` loss function, and so the fit function will not try to log that + loss. + """ + self.loss_func = self._loss_func diff --git a/fast_rl/tests/test_DataBunch.py b/fast_rl/tests/test_DataBunch.py index d80bb26..09cd915 100644 --- a/fast_rl/tests/test_DataBunch.py +++ b/fast_rl/tests/test_DataBunch.py @@ -3,23 +3,23 @@ from fast_rl.util.file_handlers import get_absolute_path -def test_ImageDataBunch_init(): - """ - For understanding various databunches. - - For example, ImageDataBunch in the from folder: - - Src is originally an ImageList, but the following code: - - `src = src.label_from_folder(classes=classes)` - - CHANGES THE CLASS TO A LABELLISTS?!?!? - - In other words, the ImageList is capable of turning into a dataset. - - :return: - """ - data = ImageDataBunch.from_folder(get_absolute_path('data'), valid_pct=0.5) - - for e in data.train_ds: - print(e) \ No newline at end of file +# def test_ImageDataBunch_init(): +# """ +# For understanding various databunches. +# +# For example, ImageDataBunch in the from folder: +# +# Src is originally an ImageList, but the following code: +# +# `src = src.label_from_folder(classes=classes)` +# +# CHANGES THE CLASS TO A LABELLISTS?!?!? +# +# In other words, the ImageList is capable of turning into a dataset. +# +# :return: +# """ +# data = ImageDataBunch.from_folder(get_absolute_path('data'), valid_pct=0.5) +# +# for e in data.train_ds: +# print(e) \ No newline at end of file diff --git a/fast_rl/tests/test_Envs.py b/fast_rl/tests/test_Envs.py index 7e65d2b..588ca46 100644 --- a/fast_rl/tests/test_Envs.py +++ b/fast_rl/tests/test_Envs.py @@ -2,47 +2,47 @@ import numpy as np import pytest from fast_rl.core.Envs import Envs -from fast_rl.core.MarkovDecisionProcess import MDPDataBunch +# from fast_rl.core.MarkovDecisionProcess import MDPDataBunchAlpha -def test_individual_env(): - msg = 'the datasets in the dataloader seem to be different from the data bunches datasets...' +# def test_individual_env(): +# msg = 'the datasets in the dataloader seem to be different from the data bunches datasets...' 
- max_steps = 50 +# max_steps = 50 - env = 'CarRacing-v0' - print(f'Testing {env}') - mdp_databunch = MDPDataBunch.from_env(env, max_steps=max_steps, num_workers=0) - epochs = 1 +# env = 'CarRacing-v0' +# print(f'Testing {env}') +# mdp_databunch = MDPDataBunchAlpha.from_env(env, max_steps=max_steps, num_workers=0) +# epochs = 1 - assert max_steps == len(mdp_databunch.train_dl) - assert max_steps == len(mdp_databunch.valid_dl) +# assert max_steps == len(mdp_databunch.train_dl) +# assert max_steps == len(mdp_databunch.valid_dl) - for epoch in range(epochs): - for _ in mdp_databunch.train_dl: - mdp_databunch.train_ds.actions = mdp_databunch.train_ds.get_random_action() - # print(f'state {element.shape} action {mdp_databunch.train_dl.dl.dataset.actions}') - assert np.sum( - np.equal(mdp_databunch.train_dl.dl.dataset.actions, mdp_databunch.train_ds.actions)) == np.size( - mdp_databunch.train_ds.actions), msg +# for epoch in range(epochs): +# for _ in mdp_databunch.train_dl: +# mdp_databunch.train_ds.actions = mdp_databunch.train_ds.get_random_action() +# # print(f's {element.shape} action {mdp_databunch.train_dl.dl.dataset.actions}') +# assert np.sum( +# np.equal(mdp_databunch.train_dl.dl.dataset.actions, mdp_databunch.train_ds.actions)) == np.size( +# mdp_databunch.train_ds.actions), msg - for _ in mdp_databunch.valid_dl: - mdp_databunch.valid_ds.actions = mdp_databunch.valid_ds.get_random_action() - # print(f'state {element.shape} action {mdp_databunch.valid_dl.dl.dataset.actions}') - assert np.sum( - np.equal(mdp_databunch.train_dl.dl.dataset.actions, mdp_databunch.train_ds.actions)) == np.size( - mdp_databunch.train_ds.actions), msg +# for _ in mdp_databunch.valid_dl: +# mdp_databunch.valid_ds.actions = mdp_databunch.valid_ds.get_random_action() +# # print(f's {element.shape} action {mdp_databunch.valid_dl.dl.dataset.actions}') +# assert np.sum( +# np.equal(mdp_databunch.train_dl.dl.dataset.actions, mdp_databunch.train_ds.actions)) == np.size( +# mdp_databunch.train_ds.actions), msg -def test_individual_env_no_dl(): - """Just a nice place to do sanity testing on new / untested envs.""" - env = gym.make('maze-random-10x10-plus-v0') - for episode in range(2): - done = False - env.reset() - while not done: - output = env.step(env.action_space.sample()) - done = output[2] - env.render('human') +# def test_individual_env_no_dl(): +# """Just a nice place to do sanity testing on new / untested envs.""" +# env = gym.make('maze-random-10x10-plus-v0') +# for episode in range(2): +# done = False +# env.reset() +# while not done: +# output = env.step(env.action_space.sample()) +# done = output[2] +# env.render('human') diff --git a/fast_rl/tests/test_Interpretation.py b/fast_rl/tests/test_Interpretation.py index 8412722..139597f 100644 --- a/fast_rl/tests/test_Interpretation.py +++ b/fast_rl/tests/test_Interpretation.py @@ -1,52 +1,2 @@ -from typing import Collection -from fastai.basic_data import DatasetType -from fast_rl.agents.DDPG import DDPG -from fast_rl.agents.DQN import DQN, FixedTargetDQN -from fast_rl.core.Interpreter import AgentInterpretationAlpha -from fast_rl.core.Learner import AgentLearner -from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, FEED_TYPE_IMAGE, FEED_TYPE_STATE - - -# def test_interpretation_heatmap(): -# data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100, -# feed_type=FEED_TYPE_STATE, add_valid=False, memory_management_strategy='non') -# model = DQN(data, batch_size=8) -# learn = AgentLearner(data, model) -# -# learn.fit(5) -# interp 
= AgentInterpretationAlpha(learn, ds_type=DatasetType.Train) -# interp.plot_heatmapped_episode(-1, action_index=0) - - -def test_interpretation_plot_q_dqn_returns(): - data = MDPDataBunch.from_env('maze-random-5x5-v0', max_steps=100, render='human', add_valid=False, - memory_management_strategy='non') - model = DQN(data) - learn = AgentLearner(data, model) - learn.fit(5) - interp = AgentInterpretationAlpha(learn, ds_type=DatasetType.Train) - interp.plot_heatmapped_episode(2) - - -# def test_inerpretation_plot_model_accuracy_fixeddqn(): -# data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100, add_valid=False, -# memory_management_strategy='non') -# model = FixedTargetDQN(data, batch_size=64, max_episodes=100, copy_over_frequency=4) -# learn = AgentLearner(data, model) -# -# learn.fit(5) -# interp = AgentInterpretationAlpha(learn, ds_type=DatasetType.Train) -# interp.plot_agent_accuracy_density() - - -# def test_interpretation_plot_q_density(): -# data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100, add_valid=False, -# memory_management_strategy='non') -# model = FixedTargetDQN(data, batch_size=128, max_episodes=100, copy_over_frequency=3, use_embeddings=True) -# learn = AgentLearner(data, model) -# -# learn.fit(4) -# interp = AgentInterpretationAlpha(learn, ds_type=DatasetType.Train) -# interp.plot_agent_accuracy_density() diff --git a/fast_rl/tests/test_MDPDataBunch.py b/fast_rl/tests/test_MDPDataBunch.py index 0d2bc75..7b5bf3c 100644 --- a/fast_rl/tests/test_MDPDataBunch.py +++ b/fast_rl/tests/test_MDPDataBunch.py @@ -1,80 +1,199 @@ +from functools import partial + import gym -from fastai.basic_data import DataBunch -from fastai.basic_train import Learner, ItemLists -from fastai.vision import ImageDataBunch import numpy as np - -from fast_rl.agents.DQN import FixedTargetDQN -from fast_rl.core.Learner import AgentLearner -from fast_rl.core.MarkovDecisionProcess import MarkovDecisionProcessSlice, MarkovDecisionProcessList, \ - MDPDataBunch, MDPDataset -from fast_rl.core.agent_core import ExperienceReplay - - -def test_MarkovDecisionProcessDataBunch_init(): - error_msg = 'state space is %s but should be %s' - error_msg2 = 'the datasets in the dataloader seem to be different from the data bunches datasets...' 
- - max_steps = 50 - # Create 2 itemlists - train_list = MDPDataset(gym.make('CartPole-v1'), max_steps=max_steps) - valid_list = MDPDataset(gym.make('CartPole-v1'), max_steps=max_steps) - - env_databunch = MDPDataBunch.create(train_list, valid_list, num_workers=0) - epochs = 1 - - assert max_steps == len(train_list) - assert max_steps == len(train_list) - assert max_steps == len(env_databunch.train_dl) - assert max_steps == len(env_databunch.valid_dl) - - for epoch in range(epochs): - for element in env_databunch.train_dl: - env_databunch.train_ds.actions = env_databunch.train_ds.env.action_space.sample() - current_s, actual_s = element.shape[1], train_list.env.observation_space.shape[0] - print(f'state {element} action {env_databunch.train_dl.dl.dataset.actions}') - assert current_s == actual_s, error_msg % (current_s, actual_s) - assert np.equal(env_databunch.train_dl.dl.dataset.actions, env_databunch.train_ds.actions), error_msg2 - - for element in env_databunch.valid_dl: - env_databunch.valid_ds.actions = env_databunch.valid_ds.env.action_space.sample() - current_s, actual_s = element.shape[1], valid_list.env.observation_space.shape[0] - print(f'state {element} action {env_databunch.valid_dl.dl.dataset.actions}') - assert current_s == actual_s, error_msg % (current_s, actual_s) - assert np.equal(env_databunch.valid_dl.dl.dataset.actions, env_databunch.valid_ds.actions), error_msg2 - - -def test_MDPDataset_MemoryManagement(): - data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100, add_valid=False) - model = FixedTargetDQN(data, batch_size=128, max_episodes=10000, lr=0.001, copy_over_frequency=3, - memory=ExperienceReplay(10000), discount=0.99, use_embeddings=True) - learn = AgentLearner(data, model, mem_strategy='k_top_best') - - learn.fit(5) - - -def test_MarkovDecisionProcessDataBunch_init_no_valid(): - error_msg = 'state space is %s but should be %s' - error_msg2 = 'the datasets in the dataloader seem to be different from the data bunches datasets...' 
- - max_steps = 50 - # Create 2 itemlists - train_list = MDPDataset(gym.make('CartPole-v1'), max_steps=max_steps) - - env_databunch = MDPDataBunch.create(train_list, num_workers=0) - env_databunch.valid_dl = None - epochs = 3 - - assert max_steps == len(train_list) - assert max_steps == len(train_list) - assert max_steps == len(env_databunch.train_dl) - assert env_databunch.valid_dl is None - - for epoch in range(epochs): - print(f'epoch {epoch}') - for element in env_databunch.train_dl: - env_databunch.train_ds.actions = env_databunch.train_ds.env.action_space.sample() - current_s, actual_s = element.shape[1], train_list.env.observation_space.shape[0] - print(f'state {element} action {env_databunch.train_dl.dl.dataset.actions}') - assert current_s == actual_s, error_msg % (current_s, actual_s) - assert np.equal(env_databunch.train_dl.dl.dataset.actions, env_databunch.train_ds.actions), error_msg2 +import pytest +import torch +from fastai.basic_train import ItemLists +from gym import error +from gym.envs.algorithmic.algorithmic_env import AlgorithmicEnv +from gym.envs.toy_text import discrete +from gym.wrappers import TimeLimit + +from fast_rl.agents.DQN import DQN +from fast_rl.core.Envs import Envs +from fast_rl.core.MarkovDecisionProcess import Action, Bounds, State, MDPDataset, MDPDataBunch, MDPMemoryManager +from fast_rl.core.basic_train import AgentLearner +from fast_rl.util.exceptions import MaxEpisodeStepsMissingError +from fast_rl.util.misc import list_in_str + +ENV_NAMES = Envs.get_all_latest_envs() + + +def validate_item_list(item_list: ItemLists): + # Check items + for i, item in enumerate(item_list.items): + if item.done: assert not item_list.items[ + i - 1].done, f'The dataset has duplicate "done\'s" that are consecutive.' + assert item.state.s is not None, f'The item: {item}\'s state is None' + assert item.state.s_prime is not None, f'The item: {item}\'s state prime is None' + + +@pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +def test_mdp_clean_callback(env): + data = MDPDataBunch.from_env(env, render='rgb_array') + model = DQN(data) + learner = AgentLearner(data, model) + learner.fit(15) + data.train_ds.env.close() + data.valid_ds.env.close() + del learner +# +# +# @pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +# def test_mdp_databunch(env): +# data = MDPDataBunch.from_env(env, add_valid=False, render='rgb_array') +# for i in range(5): +# for _ in data.train_ds: +# data.train_ds.action = Action(taken_action=data.train_ds.action.action_space.sample(), +# action_space=data.train_ds.action.action_space) +# +# validate_item_list(data.train_ds.x) +# del data +# +# +# @pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +# def test_mdp_dataset_iter(env): +# dataset = MDPDataset(gym.make(env), memory_manager = partial(MDPMemoryManager, strategy='k_partition_top'), bs=8, +# render='rgb_array') +# +# for epoch in range(5): +# for el in dataset: +# dataset.action.set_single_action(dataset.env.action_space.sample()) +# +# # Check items +# for i, item in enumerate(dataset.x.items): +# if item.done: assert not dataset.x.items[ +# i - 1].done, f'The dataset has duplicate "done\'s" that are consecutive.' 
+# assert item.state.s is not None, f'The item: {item}\'s state is None' +# assert item.state.s_prime is not None, f'The item: {item}\'s state prime is None' +# del dataset +# +# +# @pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +# def test_mdpdataset_init(env): +# try: +# init_env = gym.make(env) +# except error.DependencyNotInstalled as e: +# print(e) +# return +# +# data = MDPDataset(init_env, None, 64, 'rgb_array') +# +# try: +# max_steps = data.max_steps +# assert max_steps is not None, f'Max steps is None for env {env}' +# except MaxEpisodeStepsMissingError as e: +# return +# +# envs_to_test = { +# 'CartPole-v0': 200, +# 'MountainCar-v0': 200, +# 'maze-v0': 2000 +# } +# +# if env in envs_to_test: +# assert envs_to_test[env] == max_steps, f'Env {env} is the wrong step amount' +# del data +# +# +# @pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +# def test_bound_init(env): +# try: +# init_env = gym.make(env) +# except error.DependencyNotInstalled as e: +# print(e) +# return +# +# for bound in (Bounds(init_env.action_space), Bounds(init_env.observation_space)): +# if env.lower().__contains__('continuous'): +# assert bound.n_possible_values == np.inf, f'Env {env} is continuous, should have inf values.' +# if env.lower().__contains__('deterministic'): +# assert bound.n_possible_values != np.inf, f'Env {env} is deterministic, should have discrete values.' +# init_env.close() +# +# @pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +# def test_action_init(env): +# try: +# init_env = gym.make(env) +# except error.DependencyNotInstalled as e: +# print(e) +# return +# +# taken_action = init_env.action_space.sample() +# raw_action = np.random.rand(len(Bounds(init_env.action_space).max)) +# init_env.reset() +# _ = init_env.step(taken_action) +# +# action = Action(taken_action=taken_action, raw_action=raw_action, action_space=init_env.action_space) +# +# if list_in_str(env, ['mountaincar-', 'cartpole', 'pong']): +# assert any([action.taken_action.dtype in (int, torch.int, torch.int64)]), f'Action is wrong dtype {action}' +# assert any([action.raw_action.dtype in (float, torch.float32, torch.float64)]), f'Action is wrong dtype {action}' +# if list_in_str(env, ['carracing', 'pendulum']): +# assert any([action.taken_action.dtype in (float, torch.float32, torch.float64)]), f'Action is wrong dtype {action}' +# assert any([action.raw_action.dtype in (float, torch.float32, torch.float64)]), f'Action is wrong dtype {action}' +# init_env.close() +# +# +# @pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +# def test_state_init(env): +# try: +# init_env = gym.make(env) +# except error.DependencyNotInstalled as e: +# print(e) +# return +# +# taken_action = init_env.action_space.sample() +# state = init_env.reset() +# state_prime, reward, done, info = init_env.step(taken_action) +# State(state, state_prime, init_env.observation_space) +# init_env.close() +# +# +# @pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +# def test_state_str(env): +# try: +# init_env = gym.make(env) +# except error.DependencyNotInstalled as e: +# print(e) +# return +# +# render = 'rgb_array' +# if isinstance(init_env, TimeLimit) and isinstance(init_env.unwrapped, (AlgorithmicEnv, discrete.DiscreteEnv)): +# render = 'ansi' if render == 'rgb_array' else render +# +# taken_action = init_env.action_space.sample() +# state = init_env.reset() +# +# try: +# alt_s = init_env.render(render) +# except NotImplementedError: +# return +# +# state_prime, reward, done, info = init_env.step(taken_action) +# 
alt_s_prime = init_env.render(render) +# State(state, state_prime, alt_s, alt_s_prime, init_env.observation_space).__str__() +# init_env.close() +# +# +# # @pytest.mark.parametrize("env", sorted(['CartPole-v0', 'maze-random-5x5-v0'])) +# # def test_state_full_episode(env): +# # try: +# # init_env = gym.make(env) +# # except error.DependencyNotInstalled as e: +# # print(e) +# # return +# # +# # done = False +# # state = init_env.reset() +# # while not done: +# # taken_action = init_env.action_space.sample() +# # alt_state = init_env.render('rgb_array') +# # state_prime, reward, done, info = init_env.step(taken_action) +# # alt_s_prime = init_env.render('rgb_array') +# # State(state, state_prime, alt_state, alt_s_prime, init_env.observation_space) +# # state = state_prime +# # if done: +# # assert state_prime is not None, 'State prime is None, this should not have happened.' +# # init_env.close() diff --git a/fast_rl/tests/test_agent_core.py b/fast_rl/tests/test_agent_core.py index 2a5dd32..c55c40e 100644 --- a/fast_rl/tests/test_agent_core.py +++ b/fast_rl/tests/test_agent_core.py @@ -1,32 +1,34 @@ -from fastai.basic_train import LearnerCallback, DatasetType -from fastai.callback import Callback -from fastai.tabular import tabular_learner -from fastai.vision import cnn_learner, models +import pytest -import numpy as np -from traitlets import List -from typing import Collection - -from fast_rl.agents.BaseAgent import BaseAgent from fast_rl.agents.DDPG import DDPG -from fast_rl.agents.DQN import DQN, FixedTargetDQN, DoubleDQN, DuelingDQN, DoubleDuelingDQN -from fast_rl.core.Interpreter import AgentInterpretationAlpha -from fast_rl.core.Learner import AgentLearner -from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, FEED_TYPE_STATE -from fast_rl.core.agent_core import PriorityExperienceReplay, ExperienceReplay +from fast_rl.agents.DQN import DQN, FixedTargetDQN +from fast_rl.core.MarkovDecisionProcess import MDPDataBunch +from fast_rl.core.agent_core import PriorityExperienceReplay +from fast_rl.core.basic_train import AgentLearner def test_priority_experience_replay(): data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100, add_valid=False) model = FixedTargetDQN(data, memory=PriorityExperienceReplay(1000)) learn = AgentLearner(data, model) - learn.fit(5) - - -# def test_fit_function_ddpg(): -# data = MDPDataBunch.from_env('Pendulum-v0', render='human', max_steps=100, add_valid=False) -# model = DDPG(data, memory=PriorityExperienceReplay(1000)) -# learn = AgentLearner(data, model) -# learn.fit(5) + learn.fit(3) + data.train_ds.env.close() + + +@pytest.mark.parametrize("env", sorted(['CartPole-v0'])) +def test_databunch_dqn_fit(env): + data = MDPDataBunch.from_env(env) + model = DQN(data) + learner = AgentLearner(data=data, model=model) + learner.fit(3) + data.valid_ds.env.close() + data.train_ds.env.close() + +def test_fit_function_ddpg(): + data = MDPDataBunch.from_env('Pendulum-v0', bs=4, render='human', max_steps=100, add_valid=False) + model = DDPG(data, memory=PriorityExperienceReplay(1000)) + learn = AgentLearner(data, model) + learn.fit(3) + data.train_ds.env.close() diff --git a/fast_rl/tests/test_ddpg_models.py b/fast_rl/tests/test_ddpg_models.py index eaa29f6..b72a5c5 100644 --- a/fast_rl/tests/test_ddpg_models.py +++ b/fast_rl/tests/test_ddpg_models.py @@ -1,36 +1,28 @@ from collections import Collection +from functools import partial +from itertools import product import pytest from fastai.basic_train import LearnerCallback from 
fast_rl.agents.DDPG import DDPG
 from fast_rl.core.Envs import Envs
-from fast_rl.core.Learner import AgentLearner
-from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
+from fast_rl.core.MarkovDecisionProcess import FEED_TYPE_IMAGE, FEED_TYPE_STATE, MDPDataBunch
 from fast_rl.core.agent_core import ExperienceReplay, OrnsteinUhlenbeck
+from fast_rl.core.basic_train import AgentLearner
 
+params_ddpg = [DDPG]
+params_envs = ['Pendulum-v0', 'CarRacing-v0']
+params_state_format = [FEED_TYPE_STATE, FEED_TYPE_IMAGE]
 
-ENV_NAMES = Envs.get_all_latest_envs(exclude_key='pybullet')
-
-@pytest.mark.parametrize("env", sorted(ENV_NAMES))
-def test_all_ddpg(env):
-    data = MDPDataBunch.from_env(env, render='human', add_valid=False,
-                                 max_steps=50)
-    if data is None:
-        print(f'Env {env} is probably Mujoco... Add imports if you want and try on your own. Don\'t like '
-              f'proprietary engines like this. If you have any issues, feel free to make a PR!')
-        return
-    # If the action type is int, skip. While there is a capability of int actions spaces for DDPG,
-    # that functionality is more of a niche
-    if data.train_ds.env.action_space.dtype == int: return
-    # If the dtype is None, then it is most likely a Tuple action space. We will most likely open this
-    # functionality in the future.
-    if data.train_ds.env.action_space.dtype is None: return
-
-    # data = MDPDataBunch.from_env('MountainCarContinuous-v0', render='human')
-    model = DDPG(data=data, batch=8, memory=ExperienceReplay(200, reduce_ram=True),
-                 exploration_strategy=OrnsteinUhlenbeck(epsilon_start=1, epsilon_end=0.1, decay=0.0001, size=1,
-                                                        do_exploration=True, end_episode=450))
-    learn = AgentLearner(data, model)
-    learn.fit(5)
+@pytest.mark.parametrize(["env", "model", "s_format"], list(product(params_envs, params_ddpg, params_state_format)))
+def test_ddpg_models(env, model, s_format):
+    model = partial(model, memory=ExperienceReplay(memory_size=1000, reduce_ram=True))
+    data = MDPDataBunch.from_env(env, render='rgb_array', max_steps=20, bs=4, add_valid=False, feed_type=s_format)
+    learn = AgentLearner(data, model(data))
+    learn.fit(3)
+    data.train_ds.env.close()
+    del learn
+    del model
+    del data
diff --git a/fast_rl/tests/test_dqn_models.py b/fast_rl/tests/test_dqn_models.py
index cc0fcd9..60a3689 100644
--- a/fast_rl/tests/test_dqn_models.py
+++ b/fast_rl/tests/test_dqn_models.py
@@ -1,83 +1,32 @@
+from functools import partial
+from itertools import product
 
-def test_basic_dqn_model_maze():
-    from fast_rl.agents.DQN import DQN
-    from fast_rl.core.Learner import AgentLearner
-    from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
-    data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100)
-    model = DQN(data)
-    learn = AgentLearner(data, model)
+import pytest
 
-    learn.fit(5)
+from fast_rl.agents.DQN import DQN, FixedTargetDQN, DoubleDQN, DuelingDQN, DoubleDuelingDQN
+from fast_rl.core.MarkovDecisionProcess import MDPDataBunch, FEED_TYPE_STATE, FEED_TYPE_IMAGE
+from fast_rl.core.agent_core import ExperienceReplay
+from fast_rl.core.basic_train import AgentLearner
 
-def test_fixed_target_dqn_model_maze():
-    from fast_rl.agents.DQN import FixedTargetDQN
-    from fast_rl.core.Learner import AgentLearner
-    from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
-    print('\n')
-    data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100, add_valid=False)
-    model = FixedTargetDQN(data)
-    learn = AgentLearner(data, model)
-
-    learn.fit(5)
-
-def test_fixed_target_dqn_model_cartpole():
-    from 
fast_rl.agents.DQN import FixedTargetDQN
-    from fast_rl.core.Learner import AgentLearner
-    from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
-    from fast_rl.core.agent_core import ExperienceReplay
-    print('\n')
-    data = MDPDataBunch.from_env('CartPole-v1', render='human', max_steps=100, add_valid=False)
-    model = FixedTargetDQN(data, memory=ExperienceReplay(memory_size=100000, reduce_ram=True))
-    learn = AgentLearner(data, model)
-
-    learn.fit(5)
+params_dqn = [DuelingDQN, DoubleDQN, DQN, FixedTargetDQN, DoubleDuelingDQN]
+params_envs = ['CartPole-v0', 'MountainCar-v0', 'Pong-v0']
+params_state_format = [FEED_TYPE_STATE, FEED_TYPE_IMAGE]
 
-def test_fixed_target_dqn_no_explore_model_maze():
-    from fast_rl.agents.DQN import FixedTargetDQN
-    from fast_rl.core.Learner import AgentLearner
-    from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
-    from fast_rl.core.agent_core import GreedyEpsilon
+@pytest.mark.parametrize(["env", "model", "s_format"], list(product(params_envs, params_dqn, params_state_format)))
+def test_dqn_models(env, model, s_format):
+    model = partial(model, memory=ExperienceReplay(memory_size=1000, reduce_ram=True))
     print('\n')
-    data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100, add_valid=False)
-    model = FixedTargetDQN(data, lr=0.01, discount=0.8,
-                           exploration_strategy=GreedyEpsilon(epsilon_start=0, epsilon_end=0,
-                                                              decay=0.001, do_exploration=False))
-    learn = AgentLearner(data, model)
-
-    learn.fit(5)
-
-
-def test_double_dqn_model_maze():
-    from fast_rl.agents.DQN import DoubleDQN
-    from fast_rl.core.Learner import AgentLearner
-    from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
-    data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100)
-    model = DoubleDQN(data, batch_size=8)
-    learn = AgentLearner(data, model)
-
-    learn.fit(5)
-
-
-def test_dueling_dqn_model_maze():
-    from fast_rl.agents.DQN import DuelingDQN
-    from fast_rl.core.Learner import AgentLearner
-    from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
-
-    data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100)
-    model = DuelingDQN(data, batch_size=8)
-    learn = AgentLearner(data, model)
-    learn.fit(5)
+    data = MDPDataBunch.from_env(env, render='rgb_array', max_steps=20, bs=4, add_valid=False,
+                                 feed_type=s_format)
 
-def test_double_dueling_dqn_model_maze():
-    from fast_rl.agents.DQN import DoubleDuelingDQN
-    from fast_rl.core.Learner import AgentLearner
-    from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
+    learn = AgentLearner(data, model(data))
 
-    data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100)
-    model = DoubleDuelingDQN(data)
-    learn = AgentLearner(data, model)
+    learn.fit(3)
 
-    learn.fit(5)
+    data.train_ds.env.close()
+    del learn
+    del model
+    del data
diff --git a/fast_rl/tests/test_metrics.py b/fast_rl/tests/test_metrics.py
index 046ba48..e69de29 100644
--- a/fast_rl/tests/test_metrics.py
+++ b/fast_rl/tests/test_metrics.py
@@ -1,10 +0,0 @@
-from fast_rl.agents.DQN import FixedTargetDQN
-from fast_rl.core.Learner import AgentLearner
-from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
-
-
-def test_epsilon():
-    data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=100)
-    model = FixedTargetDQN(data, batch_size=8)
-    learn = AgentLearner(data, model)
-    learn.fit(5)
\ No newline at end of file
diff --git a/fast_rl/util/exceptions.py b/fast_rl/util/exceptions.py
new file mode 100644
index 0000000..28cfd80
--- /dev/null
+++ 
b/fast_rl/util/exceptions.py
@@ -0,0 +1,4 @@
+
+
+class MaxEpisodeStepsMissingError(Exception):
+    """Raised when an env does not expose a max episode step limit."""
\ No newline at end of file
diff --git a/fast_rl/util/misc.py b/fast_rl/util/misc.py
index d0c3051..4ca9ecf 100644
--- a/fast_rl/util/misc.py
+++ b/fast_rl/util/misc.py
@@ -13,6 +13,11 @@ class b_colors:
     UNDERLINE = '\033[4m'
 
 
+def list_in_str(s: str, str_list: list, make_lower=True):
+    if make_lower: s = s.lower()
+    return any(el in s for el in str_list)
+
+
 def is_goal_env(env, suppress_errors=True):
     msg = 'GoalEnv requires the "{}" key to be part of the observation d.'
     # Enforce that each GoalEnv uses a Goal-compatible observation space.
diff --git a/fast_rl/util/random_thingy.py b/fast_rl/util/random_thingy.py
index c2edfef..a82e664 100644
--- a/fast_rl/util/random_thingy.py
+++ b/fast_rl/util/random_thingy.py
@@ -5,25 +5,12 @@ interp.plot_heatmapped_episode(-1)
 """
-from fastai.basic_data import DatasetType
-from fast_rl.agents.DDPG import DDPG
+from fast_rl.core.basic_train import AgentLearner
 from fast_rl.agents.DQN import FixedTargetDQN
-from fast_rl.core.Learner import AgentLearner
 from fast_rl.core.MarkovDecisionProcess import MDPDataBunch
+from fast_rl.core.agent_core import ExperienceReplay
 
-from fast_rl.core.agent_core import GreedyEpsilon, OrnsteinUhlenbeck, ExperienceReplay
-from fast_rl.core.metrics import EpsilonMetric
-
-data = MDPDataBunch.from_env('CartPole-v1', render='human', add_valid=False)
-# data = MDPDataBunch.from_env('Pendulum-v0', render='human', add_valid=False)
-# data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=1000, add_valid=False)
-
-model = FixedTargetDQN(data, batch_size=64, memory=ExperienceReplay(memory_size=40000, reduce_ram=True),
-                       exploration_strategy=GreedyEpsilon(decay=0.001, epsilon_start=1, epsilon_end=0.01))
-# model = DDPG(data=data, batch=128, memory=ExperienceReplay(40000, reduce_ram=True),
-#              exploration_strategy=OrnsteinUhlenbeck(epsilon_start=1, epsilon_end=0.1, decay=0.0001, size=1,
-#                                                     do_exploration=True, end_episode=450))
-
-learn = AgentLearner(data, model)#, metrics=[EpsilonMetric])
-
+data = MDPDataBunch.from_env('Pong-v0', render='human', max_steps=100, add_valid=False)
+model = FixedTargetDQN(data, memory=ExperienceReplay(memory_size=100000, reduce_ram=True))
+learn = AgentLearner(data, model)
 learn.fit(450)
\ No newline at end of file
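
Note: the new `MaxEpisodeStepsMissingError` is currently only exercised by the commented-out `test_mdpdataset_init` above. A minimal sketch of the intended handling pattern, assuming only the `MDPDataset.max_steps` property and the exception introduced in this patch; the `safe_max_steps` helper and its fallback value are purely illustrative:

```python
from fast_rl.util.exceptions import MaxEpisodeStepsMissingError


def safe_max_steps(dataset, fallback=200):
    # `dataset` is expected to be an MDPDataset; `fallback` is an arbitrary illustrative default.
    try:
        return dataset.max_steps
    except MaxEpisodeStepsMissingError:
        # Some envs/wrappers never register a max episode step limit, so fall back instead of failing.
        return fallback
```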