Added:

 * Reduced memory (RAM) usage by cleaning items as they are added to the experience memory.

Fixed:

 * DDPG is now stable and works on Pendulum as expected.

Notes:

 * Now that DDPG works as expected, we will move to preparing the repo for version 1.0. This will involve testing / CI and passing the expected benchmarks.
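
As a rough usage sketch of the RAM reduction (the `reduce_ram` flag and the `clean(only_alt=True)` call come from the diff below; the buffer size and the pattern of passing the buffer to `DDPG(data, memory=...)` mirror `fast_rl/util/random_thingy.py`):

```python
from fast_rl.core.agent_core import ExperienceReplay

# With reduce_ram=True the buffer calls item.clean(only_alt=True) on every stored
# transition, dropping the alternate-state copy so the deque holds only what
# training needs.
memory = ExperienceReplay(100000, reduce_ram=True)

# The buffer is then handed to the agent, e.g. DDPG(data, memory=memory, ...).
```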
Josiah Laivins committed Oct 5, 2019
1 parent fea9a2e commit ed2d54f
Showing 7 changed files with 89 additions and 51 deletions.
13 changes: 9 additions & 4 deletions README.md
@@ -173,6 +173,11 @@ could give discrete agents the ability to operate in a continuous domain via bin
- [X] 0.5.0 DDPG added. let us move
- [X] 0.5.0 The DDPG paper contains a visualization for Q learning might prove useful. Add to interpreter.

| ![](res/ddpg_balancing.gif) |
|:----:|
| *Fig 7: DDPG trains stably now.* |


Added Q value interpretation per the explanation in Lillicrap et al., 2016. Currently both models (DQN and DDPG) have
unstable q value approximations. Below is an example from DQN.
```python
@@ -184,22 +189,22 @@ a failing one will look globular or horizontal.

| ![](res/dqn_q_estimate_1.jpg) |
|:----:|
| *Fig 7: Initial Q Value Estimate. Seems globular which is expected for an initial model.* |
| *Fig 8: Initial Q Value Estimate. Seems globular which is expected for an initial model.* |

| ![](res/dqn_q_estimate_2.jpg) |
|:----:|
| *Fig 8: Seems like the DQN is not learning...* |
| *Fig 9: Seems like the DQN is not learning...* |

| ![](res/dqn_q_estimate_3.jpg) |
|:----:|
| *Fig 9: Alarming later epoch results. It seems that the DQN converges to predicting a single Q value.* |
| *Fig 10: Alarming later epoch results. It seems that the DQN converges to predicting a single Q value.* |

- [X] 0.6.0 Single Global fit function like Fastai's. Think about the missing batch step. Noted some of the changes to
the existing Fastai

| ![](res/fit_func_out.jpg) |
|:----:|
| *Fig 10: Resulting output of a typical fit function using ref code below.* |
| *Fig 11: Resulting output of a typical fit function using ref code below.* |

```python
from fast_rl.agents.DQN import DuelingDQN
9 changes: 7 additions & 2 deletions fast_rl/agents/BaseAgent.py
@@ -46,6 +46,7 @@ def pick_action(self, x):
if len(x.shape) > 2: raise ValueError('The agent is outputting actions with more than 1 dimension...')

action, x, perturbed = self.exploration_strategy.perturb(x, x, self.data.train_ds.env.action_space)
x = np.clip(x, -1.0, 1.0)

if isinstance(self.data.train_ds.env.action_space, Discrete) and not perturbed: action = x.argmax().numpy().item()
elif isinstance(self.data.train_ds.env.action_space, Box): action = x.squeeze(0).numpy()
@@ -72,7 +73,8 @@ def forward(self, x):
return x.view(x.size(0), -1)


def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use_embed=True, activation_fuction=None):
def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use_embed=True,
activation_function=None, final_activation_function=None):
"""Generates an nn module.
Notes:
@@ -81,7 +83,7 @@ def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use
Returns:
"""
act = nn.LeakyReLU if activation_fuction is None else activation_fuction
act = nn.LeakyReLU if activation_function is None else activation_function
action_size = action_size[0] # For now the dimension of the action does not make a difference.
# For now keep drop out as 0, test including dropout later
ps = [0] * len(layer_list)
@@ -93,8 +95,11 @@ def create_nn_model(layer_list: list, action_size, state_size, use_bn=False, use
embedded, n_in = get_embedded(n_in[0], n_out, n_in[1], 5)
layers += [ToLong(), embedded, Flatten()]
elif i == 0: n_in = n_in[0]
if i == 0 and use_bn: layers += [nn.BatchNorm1d(n_in)]

layers += bn_drop_lin(n_in, n_out, bn=use_bn and i != 0, p=dp, actn=act)

if final_activation_function is not None: layers += [final_activation_function()]
return nn.Sequential(*layers)


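For context, the new `final_activation_function` hook is what lets the DDPG actor below squash its output with `nn.Tanh`. A hedged sketch of calling it directly, assuming the `(size, 0)` tuple convention used elsewhere in the repo and that `create_nn_model` is importable from `fast_rl.agents.BaseAgent`; the Pendulum-v0 sizes are illustrative:

```python
from torch import nn
from fast_rl.agents.BaseAgent import create_nn_model  # import path assumed

# Pendulum-v0: 1-d continuous action, 3-d state, both given as (size, 0) tuples.
actor = create_nn_model([400, 300], (1, 0), (3, 0), use_bn=False, use_embed=False,
                        final_activation_function=nn.Tanh)  # bounds output to [-1, 1]
```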
71 changes: 46 additions & 25 deletions fast_rl/agents/DDPG.py
@@ -1,7 +1,10 @@
from copy import deepcopy

import torch
from fastai.basic_train import LearnerCallback, Any, OptimWrapper, ifnone, F
import numpy as np
from fastai.metrics import RMSE
from torch import nn
from torch.nn import MSELoss
from torch.optim import Adam

@@ -30,7 +33,7 @@ def on_loss_begin(self, **kwargs: Any):
"""Performs memory updates, exploration updates, and model optimization."""
if self.learn.model.training:
self.learn.model.memory.update(item=self.learn.data.x.items[-1])
self.learn.model.exploration_strategy.update(self.episode, self.max_episodes,
self.learn.model.exploration_strategy.update(episode=self.episode, max_episodes=self.max_episodes,
do_exploration=self.learn.model.training)
post_optimize = self.learn.model.optimize()
if self.learn.model.training:
@@ -44,10 +47,31 @@ def on_loss_begin(self, **kwargs: Any):
# self.learn.model.target_copy_over()


class Critic(nn.Module):
def __init__(self, layer_list: list, action_size, state_size, use_bn=False, use_embed=True,
activation_function=None):
super().__init__()
self.action_size = action_size[0]
self.state_size = state_size[0]

self.fc1 = nn.Linear(self.state_size, layer_list[0])
self.fc2 = nn.Linear(layer_list[0] + self.action_size, layer_list[1])
self.fc3 = nn.Linear(layer_list[1], 1)

def forward(self, x):
action, x = x[:, self.state_size:], x[:, :self.state_size]

x = nn.LeakyReLU()(self.fc1(x))
x = nn.LeakyReLU()(self.fc2(torch.cat((x, action), 1)))
x = nn.LeakyReLU()(self.fc3(x))

return x


class DDPG(BaseAgent):

def __init__(self, data: MDPDataBunch, memory=None, tau=0.001, batch=64, discount=0.99,
lr=0.005, exploration_strategy=None, env_was_discrete=False):
def __init__(self, data: MDPDataBunch, memory=None, tau=1e-3, batch=64, discount=0.99,
lr=1e-3, actor_lr=1e-4, exploration_strategy=None, env_was_discrete=False):
"""
Implementation of a continuous control algorithm using an actor/critic architecture.
@@ -74,42 +98,45 @@ def __init__(self, data: MDPDataBunch, memory=None, tau=0.001, batch=64, discoun
self.lr = lr
self.discount = discount
self.batch = batch
self.tao = tau
self.tau = 1
self.memory = ifnone(memory, ExperienceReplay(10000))

self.action_model = self.initialize_action_model([30, 30], data)
self.critic_model = self.initialize_critic_model([30, 30], data)
self.action_model = self.initialize_action_model([400, 300], data)
self.critic_model = self.initialize_critic_model([400, 300], data)

self.opt = OptimWrapper.create(Adam, lr=lr, layer_groups=[self.action_model])
self.opt = OptimWrapper.create(Adam, lr=actor_lr, layer_groups=[self.action_model])
self.critic_optimizer = OptimWrapper.create(Adam, lr=lr, layer_groups=[self.critic_model])

self.t_action_model = self.initialize_action_model([30, 30], data)
self.t_critic_model = self.initialize_critic_model([30, 30], data)
self.t_action_model = deepcopy(self.action_model)
self.t_critic_model = deepcopy(self.critic_model)

self.target_copy_over()
self.tau = tau

self.learner_callbacks = [BaseDDPGCallback]

self.loss_func = F.smooth_l1_loss# MSELoss()
# TODO Move to Ornstein-Uhlenbeck process
self.loss_func = MSELoss()

self.exploration_strategy = ifnone(exploration_strategy, GreedyEpsilon(epsilon_start=1, epsilon_end=0.1,
decay=0.001,
do_exploration=self.training))

def initialize_action_model(self, layers, data):
return create_nn_model(layers, *data.get_action_state_size(), True, use_embed=data.train_ds.embeddable)
return create_nn_model(layers, *data.get_action_state_size(), False, use_embed=data.train_ds.embeddable,
final_activation_function=nn.Tanh)

def initialize_critic_model(self, layers, data):
""" Instead of state -> action, we are going state + action -> single expected reward. """
return create_nn_model(layers, (1, 0), (sum([_[0] for _ in data.get_action_state_size()]), 0), True,
use_embed=data.train_ds.embeddable)
return Critic(layers, *data.get_action_state_size())

def pick_action(self, x):
if self.training: self.action_model.eval()
with torch.no_grad():
action = super(DDPG, self).pick_action(x)
action, x = super(DDPG, self).pick_action(x)
if self.training: self.action_model.train()
return action

if not self.env_was_discrete: action = np.clip(action, -1, 1)
return action, np.clip(x, -1, 1)

def optimize(self):
"""
@@ -140,16 +167,12 @@ def optimize(self):

y_hat = self.critic_model(torch.cat((s, a), 1))

critic_loss = self.loss_func(y, y_hat)

print(f'{y[0][:15]}, {y_hat[0][:15]}')
critic_loss = self.loss_func(y_hat, y)

if self.training:
# Optimize critic network
self.critic_optimizer.zero_grad()
critic_loss.backward()
for param in self.critic_model.parameters():
param.grad.data.clamp_(-1, 1)
self.critic_optimizer.step()

actor_loss = -self.critic_model(torch.cat((s, self.action_model(s)), 1)).mean()
@@ -160,8 +183,6 @@ def optimize(self):
# Optimize actor network
self.opt.zero_grad()
actor_loss.backward()
for param in self.action_model.parameters():
param.grad.data.clamp_(-1, 1)
self.opt.step()

with torch.no_grad():
@@ -174,8 +195,8 @@ def forward(self, x):

def target_copy_over(self):
""" Soft target updates the actor and critic models.."""
self.soft_target_copy_over(self.t_action_model, self.action_model, self.tao)
self.soft_target_copy_over(self.t_critic_model, self.critic_model, self.tao)
self.soft_target_copy_over(self.t_action_model, self.action_model, self.tau)
self.soft_target_copy_over(self.t_critic_model, self.critic_model, self.tau)

def soft_target_copy_over(self, t_m, f_m, tau):
for target_param, local_param in zip(t_m.parameters(), f_m.parameters()):
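The parameter copy inside `soft_target_copy_over` is collapsed in this diff; the standard soft update it performs (per Lillicrap et al., 2016) is θ′ ← τθ + (1 − τ)θ′. A standalone sketch of that update, not the repo's exact body:

```python
import torch
from torch import nn

def soft_update(target: nn.Module, source: nn.Module, tau: float):
    """Blend source parameters into target: theta' <- tau * theta + (1 - tau) * theta'."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.copy_(tau * s_param + (1.0 - tau) * t_param)

# Note: with tau = 1 this is a hard copy, which is why __init__ above sets
# self.tau = 1 before the first target_copy_over() and only restores the small
# tau afterwards.
```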
12 changes: 7 additions & 5 deletions fast_rl/core/MarkovDecisionProcess.py
@@ -195,7 +195,7 @@ def __init__(self, env: gym.Env, feed_type=FEED_TYPE_STATE, render='rgb_array',
self.env = env
# MDP specific values
self.actions = self.get_random_action(env.action_space)
self.raw_action = np.random.randn((env.action_space.n))
self.raw_action = np.random.randn((env.action_space.shape[0])) if isinstance(env.action_space, Box) else np.random.randn((env.action_space.n))

self.is_done = True
self.current_state = None
@@ -503,10 +503,12 @@ def __init__(self, state, state_prime, alt_state, action, reward, done, episode,
'alt_state': self.alternate_state, 'action': action, 'reward': reward, 'done': done,
'episode': episode, 'feed_type': feed_type, 'raw_action': raw_action}

def clean(self):
self.current_state = None
self.result_state = None
self.alternate_state = None
def clean(self, only_alt=False):
if not only_alt:
self.current_state, self.result_state = None, None
self.obj['state'], self.obj['state_prime'] = None, None

self.alternate_state, self.obj['alt_state'] = None, None

def __str__(self):
formatted = (
17 changes: 10 additions & 7 deletions fast_rl/core/agent_core.py
@@ -42,7 +42,7 @@ def perturb(self, action, raw_action, action_space):
_ = raw_action
return action, raw_action

def update(self, episode, max_episodes, do_exploration, **kwargs):
def update(self, max_episodes, do_exploration, **kwargs):
self.do_exploration = do_exploration


@@ -73,7 +73,7 @@ def perturb(self, action, raw_action, action_space: gym.Space):
else:
return action, raw_action, False

def update(self, current_episode, end_episode=0, **kwargs):
def update(self, episode, end_episode=0, **kwargs):
super(GreedyEpsilon, self).update(**kwargs)
if self.do_exploration:
self.end_episode = end_episode
@@ -82,7 +82,7 @@ def update(self, current_episode, end_episode=0, **kwargs):
self.steps_done += 1


class OrnsteinUhlenbeck(ExplorationStrategy):
class OrnsteinUhlenbeck(GreedyEpsilon):
def __init__(self, size, mu=0., theta=0.15, sigma=0.2, **kwargs):
"""
@@ -108,11 +108,12 @@ def perturb(self, action, raw_action, action_space):
else: dx = np.zeros(self.x.shape)

self.x += dx
return action, torch.from_numpy(self.x).float() + raw_action, False
return action, self.epsilon * torch.from_numpy(self.x).float() + raw_action, False


class Experience:
def __init__(self, memory_size):
def __init__(self, memory_size, reduce_ram=False):
self.reduce_ram = reduce_ram
self.max_size = memory_size
self.callbacks = []

@@ -127,7 +128,7 @@ def refresh(self, **kwargs):


class ExperienceReplay(Experience):
def __init__(self, memory_size):
def __init__(self, memory_size, **kwargs):
"""
Basic store-er of state space transitions for training agents.
@@ -138,7 +139,7 @@ def __init__(self, memory_size):
Args:
memory_size (int): Max N samples to store
"""
super().__init__(memory_size)
super().__init__(memory_size, **kwargs)
self.max_size = memory_size
self.memory = deque(maxlen=memory_size) # type: List[MarkovDecisionProcessSlice]

@@ -150,6 +151,7 @@ def sample(self, batch, **kwargs):
return random.sample(self.memory, batch)

def update(self, item, **kwargs):
if self.reduce_ram: item.clean(True)
self.memory.append(copy.deepcopy(item))


@@ -218,6 +220,7 @@ def update(self, item, **kwargs):
"""
maximal_priority = self.alpha
if self.reduce_ram: item.clean(True)
self.memory.add(np.abs(maximal_priority) + self.epsilon, item)


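For reference, the Ornstein-Uhlenbeck step that `perturb` adds to the raw action (now scaled by the annealed epsilon inherited from `GreedyEpsilon`) follows the textbook update dx = θ(μ − x) + σ·N(0, 1); the collapsed branch above is assumed to match it. A standalone sketch:

```python
import numpy as np

def ou_step(x, mu=0.0, theta=0.15, sigma=0.2):
    """One Ornstein-Uhlenbeck step: pull x toward mu, then add Gaussian noise."""
    dx = theta * (mu - x) + sigma * np.random.randn(*x.shape)
    return x + dx

# Temporally correlated exploration noise for a 1-d action (e.g. Pendulum-v0).
x = np.zeros(1)
for _ in range(5):
    x = ou_step(x)
```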
18 changes: 10 additions & 8 deletions fast_rl/util/random_thingy.py
@@ -11,18 +11,20 @@
from fast_rl.core.MarkovDecisionProcess import MDPDataBunch

# data = MDPDataBunch.from_env('Pendulum-v0', render='human')
from fast_rl.core.agent_core import GreedyEpsilon, OrnsteinUhlenbeck
from fast_rl.core.agent_core import GreedyEpsilon, OrnsteinUhlenbeck, ExperienceReplay
from fast_rl.core.metrics import EpsilonMetric

data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=1000, add_valid=False)
# data = MDPDataBunch.from_env('Pendulum-v0', render='human', add_valid=False)
# data = MDPDataBunch.from_env('maze-random-5x5-v0', render='human', max_steps=1000, add_valid=False)
data = MDPDataBunch.from_env('Pendulum-v0', render='human', add_valid=False)
# data = MDPDataBunch.from_env('MountainCarContinuous-v0', render='human', add_valid=False)
model = DDPG(data, batch=128, lr=0.01, env_was_discrete=True,
exploration_strategy=OrnsteinUhlenbeck(4, do_exploration=True))
learn = AgentLearner(data, model)
learn.fit(40)
model = DDPG(data, batch=128, memory=ExperienceReplay(100000, reduce_ram=True),
exploration_strategy=OrnsteinUhlenbeck(epsilon_start=1, epsilon_end=0.1, decay=0.0001, size=1,
do_exploration=True, end_episode=450))
learn = AgentLearner(data, model, metrics=[EpsilonMetric])
learn.fit(4500)


from fast_rl.core.Interpreter import AgentInterpretationAlpha

interp = AgentInterpretationAlpha(learn, DatasetType.Train)
interp.plot_heatmapped_episode(-1)
interp.plot_q_density(-1)
Binary file added res/ddpg_balancing.gif