dqn_for_n_agents_cuda.py
# -*- coding: utf-8 -*-
"""DQN_for_N_agents.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1-XVG8aYRyimQWoy-CgifXG_fC_JecybJ

How to use this file:
This file implements Deep Q-Learning for N drones in a predefined grid space.
For your use case, you need to:
- Set NUMBER_ACTION_COLUMNS to the size of the joint action space (e.g., for actions 0,...,24 the value must be 25; for actions 0,...,124 it must be 125, and so on).
- Set the hyperparameters inside the Map class __init__() method.
- Adjust any other values to customize the process, e.g., the number of episodes and steps.
"""
import pandas as pd
import numpy as np
import random
import ast
from itertools import combinations
import matplotlib.pyplot as plt
import torch
from codetiming import Timer
execution_time = Timer(text="Execution time: {0:.2f} seconds")
execution_time.start()
if not torch.cuda.is_available():
    raise Exception('CUDA is not available. Aborting.')
d = pd.read_csv('./data/reward_2.dat', sep=' ')
# The action column names (0, 1, ..., 24) are strings.
# Below, we convert them to ints to make later manipulations easier.
d = d.rename({col: int(col) for col in d.columns if d[col].dtype == 'object'}, axis='columns')
NUMBER_ACTION_COLUMNS = 25  # total number of possible joint actions
# The table's cells are strings (e.g. '[945,0.95379]'). Turn them into lists to make later manipulations easier.
for col in range(NUMBER_ACTION_COLUMNS):  # from 0 to 24 (for 2 agents with 5 possible actions each)
    d[col] = d[col].apply(ast.literal_eval)
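# After parsing, each action cell holds a [next_state, next_qos] pair (values below are
# illustrative, taken from the sample string above). For a row index s:
#   d.loc[s, 3]      # -> [945, 0.95379]: next state index and its QoS when joint action 3 is taken in state s
#   d.loc[s, 'qos']  # -> QoS of state s itself (used as the episode's start/end quality measure)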
class Map:
    def __init__(self,
                 dim_grid=10,  # means the grid is 10x10
                 actions_per_agent=5,  # each agent is capable of up, right, down, left and stop movements
                 agents=2,  # total number of agents in the grid
                 state=0,  # initial state; starts at state 0 (i.e. there is a first position for all agents)
                 alpha=0.2,  # Q-learning algorithm learning rate
                 gamma=0.9,
                 # gamma is the discount factor. It is multiplied by the estimate of the optimal future value.
                 epsilon=1,
                 # epsilon handles the exploration/exploitation trade-off (e.g. epsilon = 0.4 means 40% exploration
                 # and 60% exploitation)
                 epsilon_min=0.5,
                 # minimum allowed epsilon. Epsilon is reduced by the decay_epsilon function. At the beginning,
                 # this means more exploration than exploitation.
                 epsilon_decay=0.999,
                 # epsilon decays at each step. For 1000 steps and decay 0.999, for example, epsilon decays
                 # to about 0.367 of its initial value.
                 states=4950
                 ):
        self.dim_grid = dim_grid
        self.actions_per_agent = actions_per_agent
        self.agents = agents
        self.state = state
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_max = epsilon
        self.epsilon_decay = epsilon_decay
        self.states = states
        self.init()

    # for a 10x10 grid and 2 agents, for example, the total number of states is C(10x10, 2) = 4950
    def init(self):
        self.states = len(list(combinations([i for i in range(self.dim_grid * self.dim_grid)], self.agents)))
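    # Sanity check (worked example): with dim_grid=10 and agents=2 the grid has 100 cells, so the
    # number of distinct joint placements is C(100, 2) = 100 * 99 / 2 = 4950, matching the default
    # value of the `states` argument above.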
    # gives the next state for a given action taken in the current state
    def next_state(self, current_state, act):
        return d.loc[current_state, act][0]

    # gives the QoS of the current state
    def current_qos(self, current_state):
        return d.loc[current_state, 'qos']

    # gives the QoS of the next state, given the current state and a given action
    def next_qos(self, current_state, action):
        return d.loc[current_state, action][1]

    # epsilon returns to its initial value at the start of each episode
    def resetEpsilon(self):
        self.epsilon = self.epsilon_max

    # assigns a new value to epsilon after a decay step
    def decay_epsilon(self):
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

    # if the current QoS is less than the next QoS, we have a reward; otherwise, we have a penalty.
    def reward(self, current_state, action):
        return self.next_qos(current_state, action)

    def actionResults(self, state, action):
        newstate = self.next_state(state, action)
        reward = self.reward(state, action)
        return newstate, reward
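
# Minimal usage sketch of the transition helpers (illustrative; actual values depend on
# ./data/reward_2.dat). actionResults combines the next-state and reward lookups, e.g.:
#   next_s, r = m.actionResults(0, 3)  # next state and reward for joint action 3 taken in state 0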
m = Map()
# model for 2 agents in a 10x10 grid
# l1 = 4950
possible_states = m.states
possible_actions = m.actions_per_agent ** m.agents
l1 = possible_states
l2 = 150
l3 = 100
l4 = possible_actions
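# The network maps a one-hot encoding of the joint state (an l1 = 4950-dimensional vector) to one
# Q-value per joint action (l4 = 25 outputs); l2 and l3 are hidden-layer widths.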
model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3, l4)
).cuda()
loss_fn = torch.nn.MSELoss().cuda()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
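# Optional sanity check (a minimal sketch; uncomment to verify the input/output shapes assumed here):
# _probe = torch.zeros(1, l1).cuda()
# _probe[0, 0] = 1.0
# assert model(_probe).shape == (1, l4)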
rewards = [0]
k = 0
improvements = []
episodes = 2000
steps = 300
losses = [0]
losses_episode = [0]
episodes_visited_states = [[] for _ in range(episodes)]
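# Training loop: for each episode, start from a random state, act epsilon-greedily on the
# predicted Q-values, and regress Q(s, a) towards the temporal-difference target
#   Y = r + gamma * max_a' Q(s', a')
# using the MSE loss, with the next-state Q-values computed without gradient tracking.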
for ep in range(episodes):
    random_state = random.choice(d['state'])
    # state is [0,0,...,1,...,0,0], a one-hot vector with a 1 at the position of the sampled state
    state = np.zeros((1, possible_states))
    state[0, random_state] = 1
    state_ = torch.from_numpy(state).float().cuda()
    start_qos_episode = m.current_qos(torch.argmax(state_[0]).item())
    # print(f'Qos start state (episode {ep}): {start_qos_episode}')
    # restore epsilon to its maximum value before this episode's successive decays
    m.resetEpsilon()
    mean_reward = 0
    mean_loss_episode = 0
    state2 = 0
    episode_visited_states = []
    for step in range(steps):
        model.train()
        qval = model(state_)  # predicted Q-values for the current state (torch format)
        qval_ = qval.cpu().data.numpy()  # predicted Q-values in numpy format
        # random action (random < epsilon) or best action (random > epsilon) - exploration vs exploitation
        m.epsilon = max(m.epsilon * m.epsilon_decay, m.epsilon_min)
        if random.random() < m.epsilon:
            action = np.random.randint(0, possible_actions)
        else:
            action = np.argmax(qval_)
        newstate, reward = m.actionResults(torch.argmax(state_[0]).item(), action)
        # newstate is [0,0,...,1,...,0,0], a one-hot vector with a 1 at the position of the new state
        state2 = np.zeros((1, possible_states))
        state2[0, newstate] = 1
        state2_ = torch.from_numpy(state2).float().cuda()
        mean_reward = ((k + 1) * rewards[-1] + reward) / (k + 2)
        k = k + 1
        with torch.inference_mode():
            newQ = model(state2_)
        maxQ = torch.max(newQ).cuda()
        Y = reward + (m.gamma * maxQ)
        # Y = torch.Tensor([Y]).detach().cuda()  # target value
        Y_pred = qval.squeeze()[action]  # predicted Q-value of the chosen action
        loss = loss_fn(Y_pred, Y)
        losses.append(loss.item())
        mean_loss_episode = ((step + 1) * losses_episode[-1] + loss.item()) / (step + 2)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        episode_visited_states.append(state)
        state = state2
        state_ = state2_
    final_qos_episode = m.current_qos(torch.argmax(state2_[0]).item())
    # print(f'Qos final state (episode {ep}): {final_qos_episode}')
    if final_qos_episode > start_qos_episode:
        # print(True)
        improvements.append(1)
    else:
        # print(False)
        improvements.append(0)
    # print('---------'*5)
    # moving average of the reward at the end of each episode
    rewards.append(mean_reward)
    losses_episode.append(mean_loss_episode)
    # clear_output(wait=True)
    # print(f"episode: {ep:0{5}}/{episodes} - R: {mean_reward:.{8}f} - loss: {mean_loss_episode}")
    episodes_visited_states[ep] = episode_visited_states
execution_time.stop()
print(
    f'For {episodes} episodes, there were {sum(improvements)} improvements '
    f'({round(sum(improvements) * 100 / episodes, 2)}%) and '
    f'{episodes - sum(improvements)} worse results ('
    f'{round((episodes - sum(improvements)) * 100 / episodes, 2)}%).')
############################################################################################################
plt.figure(figsize=(6, 3))
plt.plot(losses_episode)
plt.xlabel("Epochs", fontsize=16)
plt.ylabel("Loss_episode", fontsize=16)
plt.plot()
fig, ax = plt.subplots(figsize=(6, 3))
ax.plot(np.arange(len(rewards)), rewards, linestyle='solid', color='blue', linewidth=2)
ax.set_title(f'Agents: {m.agents} , Grid: {m.dim_grid}x{m.dim_grid}, Movements per Agent: {m.actions_per_agent}')
ax.set_xlabel("Episodes")
ax.set_ylabel("Avg Reward")
plt.show()
# lsts = []
# # creates a single list with all visited states across all episodes and steps (for histogram analysis)
# for e in episodes_visited_states:
# for v in e:
# lsts.extend(v)
# fig, ax = plt.subplots(figsize=(16, 9))
# ax.set_title(f'Visited States')
# ax.set_xlabel("States")
# ax.set_ylabel("Frequency")
# plt.hist(lsts, bins=m.states)
# plt.ylim(0, m.states)
# plt.show()