% DeepQN.m
%Initialize the environment:
clear all
close all
X=[20;25;pi/4]; %state vector: [x position; y position; heading angle]
%figure('Position',[0 0 631 600]) %size to get 489x489 image saved from fig
figure('Position',[0 0 316 300]) %size to get 245x245 image saved from fig
F = getframe;
[I,Map] = frame2im(F);
while(1) %retry until the captured frame is exactly 245x245
close all;
figure('Position',[0 0 316 300]);
F = getframe;
[I,Map] = frame2im(F);
if(size(I,1)==245 && size(I,2)==245)
break;
end
end
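% Note: the size returned by getframe depends on window decorations and
% screen DPI, hence the retry loop above. A possible alternative (a sketch,
% assuming the figure client area is at least 245x245 pixels) is to capture
% a fixed pixel region explicitly:
% F = getframe(gcf,[0 0 245 245]);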
%figure('Position',[0 0 526 500])
%figure('Position',[0 0 316 300])
rectangle('Position',[0,0,50,50]); %50x50 arena boundary
rectangle('Position',[25,25,2,2],'EdgeColor','k'); %square marker near the arena center
rectangle('Position',[10,40,2,2],'Curvature',[1 1]); %four circular markers (roles defined in the reward functions)
rectangle('Position',[20,30,2,2],'Curvature',[1 1]);
rectangle('Position',[20,5,2,2],'Curvature',[1 1]);
rectangle('Position',[40,30,2,2],'Curvature',[1 1]);
%plot(X(1),X(2));
hold on
p = plot(X(1),X(2),'k*');
%p=plot(X(1),X(2),'r*');
hold off
%%
% Perform random actions to generate data for training the CNN the first
% time. This is analogous to initializing the CNN with random weights.
for i=1:10
action=choose_random_action_old();
% X=[10;10;pi/4]; %can be randomized later
X=[randi([4 46]);randi([4 46]);randi([1 6])*rand()]; %randomized state
X=stateUpdate_DQN(X,action);%timestep of 1 unit
reward(:,:,:,i)=get_reward_old(X,action); %reward must be a 1x1x4 array here, one entry per action (as required by trainNetwork for regression)
%rewards act as random targets!
set(p,'XData',X(1));
set(p,'YData',X(2));
%p.XData = X(1); %property syntax; may be needed in some MATLAB versions
%p.YData = X(2);
drawnow;
% rectangle('Position',[X(1)-1,X(2)-1,2,2]);
F = getframe;
[I,Map] = frame2im(F);
states(:,:,:,i)=rgb2gray(I);
% pause(0.1);
end
for i=1:50 %getting more DATA
action=choose_random_action_old();
X=[randi([4 46]);randi([4 46]);randi([1 6])*rand()]; %randomized state
X=stateUpdate_DQN(X,action);%timestep of 1 unit
reward2(:,:,:,i)=get_reward_old(X,action); %reward must be a 1x1x4 array here, one entry per action (as required by trainNetwork for regression)
%rewards act as random targets!
set(p,'XData',X(1));
set(p,'YData',X(2));
drawnow;
F = getframe;
[I,Map] = frame2im(F);
states2(:,:,:,i)=rgb2gray(I);
% pause(0.1);
end
%%
%Define the convolutional neural network architecture.
layers = [
%imageInputLayer([489 489 1],'Name', 'input')
imageInputLayer([245 245 1],'Name', 'input')
convolution2dLayer(8,8,'Padding',1,'Name', 'conv1')
batchNormalizationLayer('Name', 'Batch_N')
reluLayer('Name', 'relu1')
%maxPooling2dLayer(2,'Stride',2,'Name', 'maxPool1')
convolution2dLayer(4,4,'Padding',1,'Name', 'conv2')
batchNormalizationLayer('Name', 'Batch_N2')
reluLayer('Name', 'relu2')
%maxPooling2dLayer(2,'Stride',2,'Name', 'maxPool2')
convolution2dLayer(4,4,'Padding',1,'Name', 'conv3')
batchNormalizationLayer('Name', 'Batch_N3')
reluLayer('Name', 'relu3')
%fullyConnectedLayer(512,'Name', 'FullyC') %optional wider hidden layer
fullyConnectedLayer(4,'Name', 'FullyC2') %output size equals the number of actions
regressionLayer('Name','Output')];
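% Optional sanity check (a sketch; analyzeNetwork ships with the same Deep
% Learning Toolbox as trainNetwork): inspect layer-by-layer output sizes to
% confirm the network ends in a 1x1x4 response per sample.
% analyzeNetwork(layers);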
options = trainingOptions('sgdm', ...
'MaxEpochs',1, ...
'InitialLearnRate',0.001, ...
'Verbose',false, ...
'Plots','training-progress');
net = trainNetwork(states,reward,layers,options);
layersTransfer = net.Layers(1:end); %carry over all layers with their trained weights
net = trainNetwork(states2,reward2,layersTransfer,options); %continue on the second batch of random data (states2/reward2 were otherwise unused)
%used the previously generated random (state, reward) pairs to train,
%thus the network is initialized and ready for DQN
%% TRAINING
options = trainingOptions('sgdm', ...
'MaxEpochs',1, ...
'InitialLearnRate',0.01, ...
'Verbose',false, ...
'Plots','none'); %suppress the training-progress plot that would otherwise appear at every training call
Max_episodes=20;
Max_steps=100;
gamma_learning=0.002; %discount factor for the target equation: target_st = reward_st + gamma*maxQ_new
epsilon=0.8; %epsilon-greedy factor for choosing actions
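% The exploration probability used below is epsilon/(ep - epsilon*ep)
% = epsilon/((1-epsilon)*ep); with epsilon = 0.8 this equals 4/ep, so
% actions are fully random for the first four episodes and the chance of
% a random action then decays as 1/ep.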
for ep=1:Max_episodes
%set initial position (randomized each episode)
%X=[10;10;randi([1 6])*rand()];
X=[randi([4 46]);randi([4 46]);randi([1 6])*rand()];
set(p,'XData',X(1));
set(p,'YData',X(2));
drawnow;
Net_reward(ep)=0;
for st=1:Max_steps
%acquire current state image,
F = getframe;
[I,Map] = frame2im(F);
state(:,:,:,1)=rgb2gray(I);
%could reuse the state_new variable created below instead of grabbing the
%frame again; to revisit later
%do a full forward pass through the CNN.
Q_st=predict(net,state(:,:,:,1)); %acquire Q values for all actions
Q_st(isnan(Q_st)) = 10; %replace any NaN Q values with a fixed constant
%state(:,:,:,2)=zeros(245,245);
%Q_st_extra=predict(net,state(:,:,:,2)); %targets for the extra sample
%An extra sample is appended to give the batch the 245x245x1x2 shape used
%for training below. Padding with zeros (or ones) could badly skew the
%weights, so it is better to duplicate the current frame itself:
state(:,:,:,2)=rgb2gray(I);
Q_st_extra=predict(net,state(:,:,:,2));
Q_st_extra(isnan(Q_st_extra)) = 10;
% choose the action to take
[~,act_st]=max(Q_st); %index of the max Q value gives the greedy action
if rand()<(epsilon/(ep-epsilon*(ep))) %epsilon-greedy with decaying exploration (see note above)
act_st = randi([1 4],1);
end
X=stateUpdate_DQN(X,act_st);%timestep of 1 unit
reward_st=get_reward_simple(X);%reward must be a single output here
%get new state
set(p,'XData',X(1));
set(p,'YData',X(2));
%p.XData = X(1); %property syntax; may be needed in some MATLAB versions
%p.YData = X(2);
drawnow;
F = getframe;
[I,Map] = frame2im(F);
state_new(:,:,:,1)=rgb2gray(I);
%do a full forward pass through CNN.
Q_st_new=predict(net,state_new); %acquire Q values for all actions in the new state
Q_st_new(isnan(Q_st_new)) = 10;
[maxQ_new,~]=max(Q_st_new);%is the max possible Q_value for next state
%set targets for the CNN:
%for all actions, the default target is the predicted Q value, so the
%loss (computed inside the CNN) is zero and no weights are updated for them
target(:,:,1:4,1:2)=0; %second sample included so the target is 1x1x4x2, matching the duplicated state batch
target(:,:,1,1)= Q_st(1);target(:,:,2,1)= Q_st(2);
target(:,:,3,1)= Q_st(3);target(:,:,4,1)= Q_st(4);
target(:,:,1,2)= Q_st_extra(1);target(:,:,2,2)= Q_st_extra(2);
target(:,:,3,2)= Q_st_extra(3);target(:,:,4,2)= Q_st_extra(4);
%target for taken action:
if reward_st==1000||reward_st==-1000 %terminal state reached; end the episode
target_st=reward_st;
else %non-terminal state: target is given by the Bellman update term
target_st=reward_st+gamma_learning*maxQ_new; %gamma set as a parameter above
end
%the target for the action actually taken is given by the above
target(:,:,act_st,1)=target_st;
target(:,:,act_st,2)=target_st;
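%In standard DQN notation the regression target built above is
%  y = r                              if s' is terminal
%  y = r + gamma * max_a' Q(s', a')   otherwise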
%train the network on the above (state, target) pair
layersTransfer = net.Layers(1:end); %transfer all previous layers (with current weights)
net = trainNetwork(state,target,layersTransfer,options);
% record the number of steps taken
steps(ep)=st;
Net_reward(ep)=Net_reward(ep)+reward_st;
if(reward_st==1000 || reward_st==-1000)%terminal reached
break;
end
% disp(st);
end
show=['Episode:',num2str(ep),' Num Steps:',num2str(steps(ep)),' Total Reward:',num2str(Net_reward(ep))];
disp(show);
end
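% Optional diagnostic (a minimal sketch, not part of the original run):
% plot the per-episode totals gathered above to inspect training progress.
% figure;
% subplot(2,1,1); plot(Net_reward); ylabel('Total reward');
% subplot(2,1,2); plot(steps); ylabel('Steps'); xlabel('Episode');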
%%
% EVALUATION
X=[10;10;pi/4];
%figure('Position',[0 0 631 600]) %size to get 489x489 image saved from fig
figure('Position',[0 0 316 300]) %size to get 245x245 image saved from fig
F = getframe;
[I,Map] = frame2im(F);
while(1) %retry until the captured frame is exactly 245x245
close all;
figure('Position',[0 0 316 300]);
F = getframe;
[I,Map] = frame2im(F);
if(size(I,1)==245 && size(I,2)==245)
break;
end
end
%figure('Position',[0 0 526 500])
%figure('Position',[0 0 316 300])
rectangle('Position',[0,0,50,50]);
rectangle('Position',[25,25,2,2],'EdgeColor','k');
rectangle('Position',[10,40,2,2],'Curvature',[1 1]);
rectangle('Position',[20,30,2,2],'Curvature',[1 1]);
rectangle('Position',[20,5,2,2],'Curvature',[1 1]);
rectangle('Position',[40,30,2,2],'Curvature',[1 1]);
%plot(X(1),X(2));
hold on
p = plot(X(1),X(2),'k*');
%p=plot(X(1),X(2),'r*');
hold off
Max_episodes=10;
Max_steps=100;
gamma_learning=0.002; %discount factor for the target equation: target_st = reward_st + gamma*maxQ_new
epsilon=0.4; %epsilon-greedy factor (exploration is commented out below, so this is unused during evaluation)
for ep=1:Max_episodes
predictionError=0;
%set initial position
X=[randi([4 46]);randi([4 46]);randi([1 6])*rand()]; %randomized state
% X=[45;10;pi/4]; %can be randomized later
set(p,'XData',X(1));
set(p,'YData',X(2));
drawnow;
Net_reward_validation(ep)=0;
for st=1:Max_steps
%acquire current state image,
F = getframe;
[I,Map] = frame2im(F);
state(:,:,:,1)=rgb2gray(I);
%could reuse the state_new variable created below instead of grabbing the
%frame again; to revisit later
%do a full forward pass through the CNN.
Q_st=predict(net,state(:,:,:,1)); %acquire Q values for all actions
% choose the action to take: greedy with respect to Q
[~,act_st]=max(Q_st); %index of the max Q value gives the greedy action
%
% if rand()<(epsilon/(1*ep)) %epsilon-greedy exploration, disabled during evaluation
% act_st = randi([1 4],1);
% end
X=stateUpdate_DQN(X,act_st);%timestep of 1 unit
reward_st=get_reward_simple(X);%reward must be a single output here
%get new state
set(p,'XData',X(1));
set(p,'YData',X(2));
%p.XData = X(1); %property syntax; may be needed in some MATLAB versions
%p.YData = X(2);
drawnow;
F = getframe;
[I,Map] = frame2im(F);
state_new(:,:,:,1)=rgb2gray(I);
%do a full forward pass through CNN.
Q_st_new=predict(net,state_new); %acquire Q values for all actions in the new state
[maxQ_new,~]=max(Q_st_new);%is the max possible Q_value for next state
%target for the taken action:
if reward_st==1000||reward_st==-1000 %terminal state reached; end the episode
target_st=reward_st;
else %non-terminal state: target is given by the Bellman update term
target_st=reward_st+gamma_learning*maxQ_new; %gamma set as a parameter above
end
%accumulate the squared TD error between the target and the predicted Q
%value for the action taken
predictionError = predictionError + (target_st - Q_st(act_st))^2;
% record the number of steps taken
steps_validation(ep)=st;
Net_reward_validation(ep)=Net_reward_validation(ep)+reward_st;
if(reward_st==1000 || reward_st==-1000)%terminal reached
break;
end
% disp(st);
end
rmse(ep)=sqrt(predictionError/steps_validation(ep)); %root-mean-square TD error over the episode
show=['Validation: Episode:',num2str(ep),' Num Steps:',num2str(steps_validation(ep)),' Total Reward:',num2str(Net_reward_validation(ep)),' RMSE: ',num2str(rmse(ep))];
disp(show);
end
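% Optional summary (a minimal sketch, not part of the original run):
% report the mean validation reward and mean per-episode RMSE.
% fprintf('Mean validation reward: %.1f, mean RMSE: %.3f\n', ...
%     mean(Net_reward_validation), mean(rmse));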