forked from kvfrans/parallel-trpo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
158 lines (132 loc) · 6.18 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import numpy as np
import tensorflow as tf
import gym
from utils import *
from model import *
import argparse
from rollouts import *
import json
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` must not be used with argparse: it calls ``bool("False")``,
    which is ``True`` because the string is non-empty, so the flag could never
    be switched off explicitly.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised spelling
            of true/false.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string is kept falsy, as it was under ``bool("")``.
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (value,))


# Command-line interface of the training script.
parser = argparse.ArgumentParser(description='TRPO.')
# these parameters should stay the same
# parser.add_argument("--task", type=str, default='Reacher-v1')
parser.add_argument("--task", type=str, default='MountainCarContinuous-v0')
parser.add_argument("--timesteps_per_batch", type=int, default=1024)
parser.add_argument("--n_steps", type=int, default=100000)
parser.add_argument("--gamma", type=float, default=.99)
parser.add_argument("--max_kl", type=float, default=.001)
parser.add_argument("--cg_damping", type=float, default=1e-3)
parser.add_argument("--num_threads", type=int, default=5)
parser.add_argument("--monitor", type=_str2bool, default=False)
# change these parameters for testing
parser.add_argument("--decay_method", type=str, default="adaptive")  # adaptive, adaptive-margin, none
parser.add_argument("--timestep_adapt", type=int, default=0)
parser.add_argument("--kl_adapt", type=float, default=0)
args = parser.parse_args()

# Episode length limit of the chosen task, taken from the gym registry.
# `timestep_limit` is read first so behaviour is unchanged wherever it exists;
# gym releases that no longer provide it expose the limit as
# `max_episode_steps` instead.
_task_spec = gym.spec(args.task)
args.max_pathlength = getattr(_task_spec, "timestep_limit", None)
if args.max_pathlength is None:
    args.max_pathlength = getattr(_task_spec, "max_episode_steps", None)
# Abstraction for parallelism inside of a single machine.
# `learner_tasks` carries commands from this script to the learner process and
# `learner_results` carries its replies back. A JoinableQueue is used for the
# commands so that `learner_tasks.join()` can block until every queued command
# has been marked done (presumably the learner calls `task_done()` once it has
# handled a command -- confirm in the TRPO class).
learner_tasks = multiprocessing.JoinableQueue()
learner_results = multiprocessing.Queue()
learner_env = gym.make(args.task)

# Learner corresponds with the TRPO algorithm
learner = TRPO(args, learner_env.observation_space, learner_env.action_space, learner_tasks, learner_results)
learner.start()

# Create the parallel rollouts, in other words Actors that execute rollouts when
# each one of them gets called appropriately
rollouts = ParallelRollout(args)

# Command code 1: the reply read below is used as the initial policy weights.
learner_tasks.put(1)
learner_tasks.join()
starting_weights = learner_results.get()
# Get the same weights in all workers (Actors as named here)
rollouts.set_policy_weights(starting_weights)

start_time = time.time()

# Per-iteration statistics, one list entry per training iteration.
history = {
    "rollout_time": [],
    "learn_time": [],
    "mean_reward": [],
    "timesteps": [],
    "maxkl": [],
}

# start it off with a big negative number
last_reward = -1000000
recent_total_reward = 0
totalsteps = 0

# Initial schedule values, kept because the adaptive schedule mutates `args`.
starting_timesteps = args.timesteps_per_batch
starting_kl = args.max_kl

# Create the writer for TensorBoard logs. The directory is derived from the
# selected task so that runs on different environments do not share a log
# directory; for the default task the path is unchanged.
summary_writer = tf.summary.FileWriter(
    "/tmp/experiments/%s/parallel-TRPO" % args.task,
    graph=tf.get_default_graph())

iteration = 0
def _adapt_schedule(improving, max_timesteps):
    """Adjust ``args.timesteps_per_batch`` and ``args.max_kl`` in place.

    Args:
        improving: True when the reward summed over the last window did not
            fall below the comparison threshold.
        max_timesteps: batch size at or above which the batch is no longer
            grown when the policy is not improving.
    """
    if improving:
        print("Policy is improving. Increase KL and decrease steps.")
        if args.timesteps_per_batch > 1200:
            args.timesteps_per_batch -= args.timestep_adapt
        if args.max_kl < 0.01:
            args.max_kl += args.kl_adapt
    else:
        print("Policy is not improving. Decrease KL and increase steps.")
        if args.timesteps_per_batch < max_timesteps:
            args.timesteps_per_batch += args.timestep_adapt
        if args.max_kl > 0.001:
            args.max_kl -= args.kl_adapt


# Main training loop: collect rollouts, let the learner update the policy,
# log, adapt the schedule, and push the new weights to the rollout workers.
# The try/finally makes sure the workers are told to stop and the TensorBoard
# writer is closed even when the loop is left through an exception or Ctrl-C.
try:
    while True:
        iteration += 1

        # runs a bunch of async processes that collect rollouts
        rollout_start = time.time()
        paths = rollouts.rollout()
        rollout_time = (time.time() - rollout_start) / 60.0

        # Why is the learner in an async process?
        # Well, it turns out tensorflow has an issue: when there's a tf.Session in the main thread
        # and an async process creates another tf.Session, it will freeze up.
        # To solve this, we just make the learner's tf.Session in its own async process,
        # and wait until the learner's done before continuing the main thread.
        learn_start = time.time()
        # Command code 2 with the current KL limit, followed by the paths;
        # the reply is the updated weights and the mean reward.
        learner_tasks.put((2, args.max_kl))
        learner_tasks.put(paths)
        learner_tasks.join()
        new_policy_weights, mean_reward = learner_results.get()
        learn_time = (time.time() - learn_start) / 60.0

        print("-------- Iteration %d ----------" % iteration)
        print("Total time: %.2f mins" % ((time.time() - start_time) / 60.0))

        # Remember the batch size that was in effect for THIS iteration before
        # the adaptive schedule below may change it.
        batch_timesteps = args.timesteps_per_batch

        history["rollout_time"].append(rollout_time)
        history["learn_time"].append(learn_time)
        history["mean_reward"].append(mean_reward)
        history["timesteps"].append(batch_timesteps)
        history["maxkl"].append(args.max_kl)

        recent_total_reward += mean_reward

        # Every 10 iterations compare the reward summed over the window with
        # the previous window and adapt batch size / KL limit accordingly.
        if iteration % 10 == 0 and args.decay_method in ("adaptive", "adaptive-margin"):
            if args.decay_method == "adaptive":
                threshold = last_reward
                max_timesteps = 20000
            else:
                # adaptive-margin: require beating the last window by 5%.
                threshold = last_reward + abs(last_reward * 0.05)
                max_timesteps = 10000
                print("Last reward: %f Scaled: %f Recent: %f" % (last_reward, threshold, recent_total_reward))
            _adapt_schedule(not (recent_total_reward < threshold), max_timesteps)
            last_reward = recent_total_reward
            recent_total_reward = 0

        # timesteps = sum([len(path["rewards"]) for path in paths])
        summary = tf.Summary(value=[tf.Summary.Value(tag="iteration_reward_mean", simple_value=mean_reward)])
        summary_writer.add_summary(summary, iteration)
        summary_writer.flush()

        print("Current steps is " + str(args.timesteps_per_batch) + " and KL is " + str(args.max_kl))

        # if iteration % 100 == 0:
        # with open("%s-%s-%f-%f-%f-%f" % (args.task, args.decay_method, starting_timesteps, starting_kl, args.timestep_adapt, args.kl_adapt), "w") as outfile:
        # json.dump(history,outfile)

        # Count the batch size used for this iteration, not the adapted one,
        # so that totalsteps stays equal to sum(history["timesteps"]).
        totalsteps += batch_timesteps
        print("%d total steps have happened" % totalsteps)
        if totalsteps > args.n_steps:
            break

        rollouts.set_policy_weights(new_policy_weights)
finally:
    try:
        rollouts.end()
    finally:
        summary_writer.close()