dqn_for_n_agents_cuda.py
# -*- coding: utf-8 -*-
"""DQN_for_N_agents.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1-XVG8aYRyimQWoy-CgifXG_fC_JecybJ

How to use this file:
This file implements Deep Q-Learning for N drones in a predefined grid space.
For your use case, you need to:
- Set NUMBER_ACTION_COLUMNS to the size of the joint action space (e.g., for actions 0,...,24 the value must be 25; for actions 0,...,124 it must be 125, and so on).
- Set the hyperparameters inside the Map class __init__() method.
- Adjust any other values to customize the process, e.g., the number of episodes and steps.
"""
import pandas as pd
import numpy as np
import random
import ast
from itertools import combinations
import matplotlib.pyplot as plt
import torch
from codetiming import Timer
execution_time = Timer(text="Execution time: {0:.2f} seconds")
execution_time.start()
if not torch.cuda.is_available():
    raise Exception('CUDA is not available. Aborting.')
d = pd.read_csv('./data/reward_2.dat', sep=' ')
# The action column names (0, 1, ..., 24) are strings.
# Below, we convert them to ints to make later manipulations easier.
d = d.rename({col: int(col) for col in d.columns if d[col].dtype == 'object'}, axis='columns')
NUMBER_ACTION_COLUMNS = 25  # total number of possible joint actions
# The table's cells are strings (e.g. '[945,0.95379]'). Turn them into lists to make later manipulations easier.
for col in range(NUMBER_ACTION_COLUMNS):  # from 0 to 24 (for 2 agents with 5 possible actions each)
    d[col] = d[col].apply(ast.literal_eval)
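# After parsing, each action cell holds a [next_state, next_qos] pair (values below are
# illustrative, taken from the sample string above). For a row index s:
#   d.loc[s, 3]      # -> [945, 0.95379]: next state index and its QoS when joint action 3 is taken in state s
#   d.loc[s, 'qos']  # -> QoS of state s itself (used as the episode's start/end quality measure)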
class Map:
    def __init__(self,
                 dim_grid=10,  # means the grid is 10x10
                 actions_per_agent=5,  # each agent is capable of up, right, down, left and stop movements
                 agents=2,  # total number of agents in the grid
                 state=0,  # initial state; starts at state 0 (i.e. there is a first position for all agents)
                 alpha=0.2,  # Q-learning algorithm learning rate
                 gamma=0.9,
                 # gamma is the discount factor. It is multiplied by the estimate of the optimal future value.
                 epsilon=1,
                 # epsilon handles the exploration/exploitation trade-off (e.g. epsilon = 0.4 means 40% exploration
                 # and 60% exploitation)
                 epsilon_min=0.5,
                 # minimum allowed epsilon. Epsilon is reduced by the decay_epsilon function. At the beginning,
                 # this means more exploration than exploitation.
                 epsilon_decay=0.999,
                 # epsilon decays at each step. For 1000 steps and decay 0.999, for example, epsilon decays
                 # to about 0.367 of its initial value.
                 states=4950
                 ):
        self.dim_grid = dim_grid
        self.actions_per_agent = actions_per_agent
        self.agents = agents
        self.state = state
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_max = epsilon
        self.epsilon_decay = epsilon_decay
        self.states = states
        self.init()

    # for a 10x10 grid and 2 agents, for example, the total number of states is C(10x10, 2) = 4950
    def init(self):
        self.states = len(list(combinations([i for i in range(self.dim_grid * self.dim_grid)], self.agents)))
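    # Sanity check (worked example): with dim_grid=10 and agents=2 the grid has 100 cells, so the
    # number of distinct joint placements is C(100, 2) = 100 * 99 / 2 = 4950, matching the default
    # value of the `states` argument above.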
    # gives the next state for a given action taken in the current state
    def next_state(self, current_state, act):
        return d.loc[current_state, act][0]

    # gives the QoS of the current state
    def current_qos(self, current_state):
        return d.loc[current_state, 'qos']

    # gives the QoS of the next state, given the current state and a given action
    def next_qos(self, current_state, action):
        return d.loc[current_state, action][1]

    # epsilon returns to its initial value at the start of each episode
    def resetEpsilon(self):
        self.epsilon = self.epsilon_max

    # assigns a new value to epsilon after a decay step
    def decay_epsilon(self):
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

    # if the current QoS is less than the next QoS, we have a reward; otherwise, we have a penalty.
    def reward(self, current_state, action):
        return self.next_qos(current_state, action)

    def actionResults(self, state, action):
        newstate = self.next_state(state, action)
        reward = self.reward(state, action)
        return newstate, reward
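
# Minimal usage sketch of the transition helpers (illustrative; actual values depend on
# ./data/reward_2.dat). actionResults combines the next-state and reward lookups, e.g.:
#   next_s, r = m.actionResults(0, 3)  # next state and reward for joint action 3 taken in state 0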
m = Map()
# model for 2 agents in a 10x10 grid
# l1 = 4950
possible_states = m.states
possible_actions = m.actions_per_agent ** m.agents
l1 = possible_states
l2 = 150
l3 = 100
l4 = possible_actions
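# The network maps a one-hot encoding of the joint state (an l1 = 4950-dimensional vector) to one
# Q-value per joint action (l4 = 25 outputs); l2 and l3 are hidden-layer widths.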
model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3, l4)
).cuda()
loss_fn = torch.nn.MSELoss().cuda()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
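# Optional sanity check (a minimal sketch; uncomment to verify the input/output shapes assumed here):
# _probe = torch.zeros(1, l1).cuda()
# _probe[0, 0] = 1.0
# assert model(_probe).shape == (1, l4)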
rewards = [0]
k = 0
improvements = []
episodes = 2000
steps = 300
losses = [0]
losses_episode = [0]
episodes_visited_states = [[] for _ in range(episodes)]
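# Training loop: for each episode, start from a random state, act epsilon-greedily on the
# predicted Q-values, and regress Q(s, a) towards the temporal-difference target
#   Y = r + gamma * max_a' Q(s', a')
# using the MSE loss, with the next-state Q-values computed without gradient tracking.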
for ep in range(episodes):
    random_state = random.choice(d['state'])
    # state is [0,0,...,1,...,0,0], a one-hot vector with a 1 at the position of the sampled state
    state = np.zeros((1, possible_states))
    state[0, random_state] = 1
    state_ = torch.from_numpy(state).float().cuda()
    start_qos_episode = m.current_qos(torch.argmax(state_[0]).item())
    # print(f'Qos start state (episode {ep}): {start_qos_episode}')
    # restore epsilon to its maximum value before this episode's successive decays
    m.resetEpsilon()
    mean_reward = 0
    mean_loss_episode = 0
    state2 = 0
    episode_visited_states = []
    for step in range(steps):
        model.train()
        qval = model(state_)  # predicted Q-values for the current state (torch format)
        qval_ = qval.cpu().data.numpy()  # predicted Q-values in numpy format
        # random action (random < epsilon) or best action (random > epsilon) - exploration vs exploitation
        m.epsilon = max(m.epsilon * m.epsilon_decay, m.epsilon_min)
        if random.random() < m.epsilon:
            action = np.random.randint(0, possible_actions)
        else:
            action = np.argmax(qval_)
        newstate, reward = m.actionResults(torch.argmax(state_[0]).item(), action)
        # newstate is [0,0,...,1,...,0,0], a one-hot vector with a 1 at the position of the new state
        state2 = np.zeros((1, possible_states))
        state2[0, newstate] = 1
        state2_ = torch.from_numpy(state2).float().cuda()
        mean_reward = ((k + 1) * rewards[-1] + reward) / (k + 2)
        k = k + 1
        with torch.inference_mode():
            newQ = model(state2_)
        maxQ = torch.max(newQ).cuda()
        Y = reward + (m.gamma * maxQ)
        # Y = torch.Tensor([Y]).detach().cuda()  # target value
        Y_pred = qval.squeeze()[action]  # predicted Q-value of the chosen action
        loss = loss_fn(Y_pred, Y)
        losses.append(loss.item())
        mean_loss_episode = ((step + 1) * losses_episode[-1] + loss.item()) / (step + 2)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        episode_visited_states.append(state)
        state = state2
        state_ = state2_
    final_qos_episode = m.current_qos(torch.argmax(state2_[0]).item())
    # print(f'Qos final state (episode {ep}): {final_qos_episode}')
    if final_qos_episode > start_qos_episode:
        # print(True)
        improvements.append(1)
    else:
        # print(False)
        improvements.append(0)
    # print('---------'*5)
    # moving average of the reward at the end of each episode
    rewards.append(mean_reward)
    losses_episode.append(mean_loss_episode)
    # clear_output(wait=True)
    # print(f"episode: {ep:0{5}}/{episodes} - R: {mean_reward:.{8}f} - loss: {mean_loss_episode}")
    episodes_visited_states[ep] = episode_visited_states
execution_time.stop()
print(
    f'For {episodes} episodes, there were {sum(improvements)} improvements '
    f'({round(sum(improvements) * 100 / episodes, 2)}%) and '
    f'{episodes - sum(improvements)} worse results ('
    f'{round((episodes - sum(improvements)) * 100 / episodes, 2)}%).')
############################################################################################################
plt.figure(figsize=(6, 3))
plt.plot(losses_episode)
plt.xlabel("Epochs", fontsize=16)
plt.ylabel("Loss_episode", fontsize=16)
plt.plot()
fig, ax = plt.subplots(figsize=(6, 3))
ax.plot(np.arange(len(rewards)), rewards, linestyle='solid', color='blue', linewidth=2)
ax.set_title(f'Agents: {m.agents} , Grid: {m.dim_grid}x{m.dim_grid}, Movements per Agent: {m.actions_per_agent}')
ax.set_xlabel("Episodes")
ax.set_ylabel("Avg Reward")
plt.show()
# lsts = []
# # creates a single list with all visited states across all episodes and steps (for histogram analysis)
# for e in episodes_visited_states:
# for v in e:
# lsts.extend(v)
# fig, ax = plt.subplots(figsize=(16, 9))
# ax.set_title(f'Visited States')
# ax.set_xlabel("States")
# ax.set_ylabel("Frequency")
# plt.hist(lsts, bins=m.states)
# plt.ylim(0, m.states)
# plt.show()