training.py
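"""Q-learning on CartPole-v0 with a small fully connected Q-network
(TensorFlow 1.x), epsilon-greedy exploration, and an optional
parameter-space noise helper (add_noise)."""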
import random

import numpy as np
import tensorflow as tf
import gym

env = gym.make('CartPole-v0')
state_space_size = 4  # CartPole observation: cart position, cart velocity, pole angle, pole angular velocity
n_actions = 2         # CartPole actions: push cart left (0) or right (1)

def add_noise(sess, var_names, var_shapes, noise_std=1):
    """Perturb every variable in `var_names` in-place with Gaussian
    parameter-space noise of standard deviation `noise_std`."""
    if noise_std == 0:
        return
    old_vals = sess.run(var_names)
    new_vals = [
        val + np.random.normal(0, noise_std, size=shape)
        for val, shape in zip(old_vals, var_shapes)
    ]
    for var, new_val in zip(var_names, new_vals):
        sess.run(var.assign(new_val))
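
# Intended usage (mirrors the disabled call inside the training loop below):
#   add_noise(sess, trainable_var_names, trainable_var_shapes, noise_std=noise_std / 10)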

def create_model():
    """Build a two-hidden-layer fully connected Q-network and return the
    state placeholder, the target placeholder and the predicted Q-values."""
    input_ph = tf.placeholder(dtype=tf.float32, shape=[None, state_space_size])
    output_ph = tf.placeholder(dtype=tf.float32, shape=[None, n_actions])
    hidden_layer1 = tf.layers.dense(inputs=input_ph, units=164, activation=tf.nn.relu, name='hidden_layer1')
    hidden_layer2 = tf.layers.dense(inputs=hidden_layer1, units=150, activation=tf.nn.relu, name='hidden_layer2')
    output_pred = tf.layers.dense(inputs=hidden_layer2, units=n_actions, name='output_layer')
    return input_ph, output_ph, output_pred
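
# output_pred has shape [batch, n_actions]: one Q-value estimate per action
# for each input state.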

input_ph, output_ph, output_pred = create_model()

# Loss: mean squared error between the predicted Q-values and the targets fed
# through output_ph (the 0.5 factor only rescales the gradient).
mse = tf.reduce_mean(0.5 * tf.square(output_pred - output_ph))

# Optimizer: Adam with its default learning rate (0.001).
opt = tf.train.AdamOptimizer().minimize(mse)

# Variable initialization happens inside the session below.
# --- sess.run(tf.global_variables_initializer())

# Optional saver for checkpointing model variables.
# --- saver = tf.train.Saver()

with tf.Session() as sess:
    rendering = False
    epsilon = 0.5           # epsilon-greedy exploration rate
    noise_std = 0.5         # std of the optional parameter-space noise
    gamma = 0.99            # discount factor
    reward_sum = 0
    total_episodes = 10000
    sess.run(tf.global_variables_initializer())
    # Collect the trainable variables and their shapes for add_noise.
    trainable_var_names = tf.trainable_variables()
    trainable_var_shapes = [v.shape for v in sess.run(trainable_var_names)]
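    # Training loop: one Q-learning update per environment step; there is no
    # replay buffer and no separate target network.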
    for episode_number in range(total_episodes):
        state = env.reset()  # initial state of the environment
        done = False
        episode_length = 0
        while not done:
            episode_length += 1
            # if rendering:
            #     env.render()
            # We are in state S: run the Q-network on S to get Q-values for
            # all possible actions.
            qval = sess.run(
                output_pred,
                feed_dict={input_ph: state.reshape((1, state_space_size))}
            )
            # Epsilon-greedy action selection.
            if random.random() < epsilon:  # explore: random action
                action = np.random.randint(0, n_actions)
            else:                          # exploit: best action under Q(s, a)
                action = np.argmax(qval)
            # Take the action, observe the new state S' and the reward.
            new_state, reward, done, info = env.step(action)
            # Get max_a' Q(S', a') from the current network.
            newQ = sess.run(
                output_pred,
                feed_dict={input_ph: new_state.reshape(1, state_space_size)}
            )
            maxQ = np.max(newQ)
            y = np.zeros((1, n_actions))
            y[:] = qval[:]
            if done:  # terminal state: no bootstrapped future value
                update = reward
            else:     # non-terminal state: reward plus discounted max future Q
                update = reward + gamma * maxQ
            y[0][action] = update  # target output for the taken action
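            # Standard Q-learning target: y[action] = r for terminal S', else
            # r + gamma * max_a' Q(S', a'); the other entries keep the network's
            # own predictions, so the squared error only trains the Q-value of
            # the action that was actually taken.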
            # One gradient step on the squared error between Q(S, .) and y.
            _, mse_run = sess.run(
                [opt, mse],
                feed_dict={
                    input_ph: state.reshape(1, state_space_size),
                    output_ph: y
                }
            )
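            # Optional parameter-space exploration: instead of (or on top of)
            # epsilon-greedy, perturb the network weights directly after each
            # update. Disabled here; uncomment the call below to try it.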
            # add_noise(
            #     sess=sess,
            #     var_names=trainable_var_names,
            #     var_shapes=trainable_var_shapes,
            #     noise_std=noise_std / 10,
            # )
            state = new_state
        print(f'end of episode {episode_number}: lasted {episode_length} steps, '
              f'epsilon: {epsilon}, noise_std: {noise_std}')
        # Decay exploration: shrink epsilon towards 0.1 and tie the noise scale
        # to it; switch the parameter noise off entirely after 50 episodes and
        # turn the rendering flag on after 150.
        if epsilon > 0.1:
            epsilon -= 1 / 15
        noise_std = epsilon / 10
        if episode_number > 50:
            noise_std = 0
        if episode_number > 150:
            rendering = True
        # if (episode_number + 10) % 50:
        #     import pdb; pdb.set_trace()
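
    # A minimal greedy-evaluation sketch, kept commented out like the other
    # optional blocks above (it assumes it runs inside this same tf.Session
    # block, after the training loop):
    #
    # for _ in range(5):
    #     state = env.reset()
    #     done = False
    #     steps = 0
    #     while not done:
    #         q = sess.run(output_pred,
    #                      feed_dict={input_ph: state.reshape(1, state_space_size)})
    #         state, _, done, _ = env.step(int(np.argmax(q)))
    #         steps += 1
    #     print(f'greedy rollout lasted {steps} steps')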