Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

controller: Proactively disconnect node based on heartbeat #911

Merged
merged 3 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config/agent/agent.conf
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#ControllerAddress=

#
# Defines the interval between two heartbeat signals sent to bluechi in milliseconds.
# Defines the interval between two heartbeat signals sent to bluechi in milliseconds. A value of 0 disables it.
#HeartbeatInterval=2000

#
Expand Down
10 changes: 10 additions & 0 deletions config/controller/controller.conf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@
# The port the controller listens on to establish connections with the bluechi agents.
#ControllerPort=842

#
# Defines the interval for which the bluechi-controller periodically checks the connectivity of all nodes based on
# the received heartbeat signals sent to bluechi-agent to bluechi-controller, in milliseconds.
#HeartbeatInterval=0

#
# Defines the threshold to actively disconnect nodes based on the time difference from the last received heartbeat
# signal from the respective bluechi-agent. In milliseconds.
#NodeHeartbeatThreshold=6000

#
# The level used for logging. Supported values are: DEBUG, INFO, WARN and ERROR.
#LogLevel=INFO
Expand Down
8 changes: 8 additions & 0 deletions doc/man/bluechi-controller.conf.5.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@ A comma separated list of unique bluechi-agent names. It's mandatory to set the
in the list can connect to `bluechi-controller'. These names are defined in the agent's configuration file under `NodeName`
option (see `bluechi-agent.conf(5)`).

#### **HeartbeatInterval** (long)

The interval to periodically check node's connectivity based on heartbeat signals sent to bluechi, in milliseconds. A value of 0 disables it.

#### **NodeHeartbeatThreshold** (long)

The threshold in milliseconds to determine whether a node is disconnected. If the node's last heartbeat signal was received before this threshold, bluechi assumes that the node is down or the connection was cut off and performs a disconnect.

#### **LogLevel** (string)

The level used for logging. Supported values are:
Expand Down
3 changes: 2 additions & 1 deletion src/agent/agent.c
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,8 @@ static int agent_setup_heartbeat_timer(Agent *agent) {
assert(agent);

if (agent->heartbeat_interval_msec <= 0) {
bc_log_warnf("Heartbeat disabled since interval is '%d', ", agent->heartbeat_interval_msec);
bc_log_warnf("Heartbeat disabled since configured interval '%d' is <=0",
agent->heartbeat_interval_msec);
return 0;
}

Expand Down
4 changes: 2 additions & 2 deletions src/client/method-status.c
Original file line number Diff line number Diff line change
Expand Up @@ -488,8 +488,8 @@ static char *node_connection_fmt_last_seen(NodeConnection *con) {
}

struct timespec t;
t.tv_sec = (time_t) con->last_seen;
t.tv_nsec = 0;
t.tv_sec = (time_t) con->last_seen / USEC_PER_SEC;
t.tv_nsec = (long) (con->last_seen % USEC_PER_SEC) * NSEC_PER_USEC;
return get_formatted_log_timestamp_for_timespec(t, false);
}

Expand Down
123 changes: 123 additions & 0 deletions src/controller/controller.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "libbluechi/bus/utils.h"
#include "libbluechi/common/cfg.h"
#include "libbluechi/common/common.h"
#include "libbluechi/common/event-util.h"
#include "libbluechi/common/parse-util.h"
#include "libbluechi/common/time-util.h"
#include "libbluechi/log/log.h"
Expand Down Expand Up @@ -276,6 +277,28 @@ bool controller_set_port(Controller *controller, const char *port_s) {
return true;
}

bool controller_set_heartbeat_interval(Controller *controller, const char *interval_msec) {
long interval = 0;

if (!parse_long(interval_msec, &interval)) {
bc_log_errorf("Invalid heartbeat interval format '%s'", interval_msec);
return false;
}
controller->heartbeat_interval_msec = interval;
return true;
}

bool controller_set_heartbeat_threshold(Controller *controller, const char *threshold_msec) {
long threshold = 0;

if (!parse_long(threshold_msec, &threshold)) {
bc_log_errorf("Invalid heartbeat threshold format '%s'", threshold_msec);
return false;
}
controller->heartbeat_threshold_msec = threshold;
return true;
}

bool controller_parse_config(Controller *controller, const char *configfile) {
int result = 0;

Expand Down Expand Up @@ -330,6 +353,20 @@ bool controller_apply_config(Controller *controller) {
}
}

const char *interval_msec = cfg_get_value(controller->config, CFG_HEARTBEAT_INTERVAL);
if (interval_msec) {
if (!controller_set_heartbeat_interval(controller, interval_msec)) {
return false;
}
}

const char *threshold_msec = cfg_get_value(controller->config, CFG_HEARTBEAT_THRESHOLD);
if (threshold_msec) {
if (!controller_set_heartbeat_threshold(controller, threshold_msec)) {
return false;
}
}

/* Set socket options used for peer connections with the agents */
const char *keepidle = cfg_get_value(controller->config, CFG_TCP_KEEPALIVE_TIME);
if (keepidle) {
Expand Down Expand Up @@ -444,6 +481,86 @@ static bool controller_setup_node_connection_handler(Controller *controller) {
return true;
}

static int controller_reset_heartbeat_timer(Controller *controller, sd_event_source **event_source);

static int controller_heartbeat_timer_callback(
sd_event_source *event_source, UNUSED uint64_t usec, void *userdata) {
Controller *controller = (Controller *) userdata;
Node *node = NULL;
int r = 0;

LIST_FOREACH(nodes, node, controller->nodes) {
uint64_t diff = 0;
uint64_t now = 0;

if (!node_is_online(node)) {
continue;
}

now = get_time_micros();
if (now == 0) {
bc_log_error("Failed to get the time");
continue;
}

if (now < node->last_seen) {
bc_log_error("Clock skew detected");
continue;
}

diff = now - node->last_seen;
if (diff > (uint64_t) controller->heartbeat_threshold_msec * USEC_PER_MSEC) {
bc_log_infof("Did not receive heartbeat from node '%s' since '%d'ms. Disconnecting it...",
node->name,
controller->heartbeat_threshold_msec);
node_disconnect(node);
engelmi marked this conversation as resolved.
Show resolved Hide resolved
}
}

r = controller_reset_heartbeat_timer(controller, &event_source);
if (r < 0) {
bc_log_errorf("Failed to reset controller heartbeat timer: %s", strerror(-r));
return r;
}

return 0;
}

static int controller_reset_heartbeat_timer(Controller *controller, sd_event_source **event_source) {
return event_reset_time_relative(
controller->event,
event_source,
CLOCK_BOOTTIME,
controller->heartbeat_interval_msec * USEC_PER_MSEC,
0,
controller_heartbeat_timer_callback,
controller,
0,
"controller-heartbeat-timer-source",
false);
}

static int controller_setup_heartbeat_timer(Controller *controller) {
_cleanup_(sd_event_source_unrefp) sd_event_source *event_source = NULL;
int r = 0;

assert(controller);

if (controller->heartbeat_interval_msec <= 0) {
bc_log_warnf("Heartbeat disabled since configured interval '%d' is <=0",
controller->heartbeat_interval_msec);
return 0;
}

r = controller_reset_heartbeat_timer(controller, &event_source);
if (r < 0) {
bc_log_errorf("Failed to reset controller heartbeat timer: %s", strerror(-r));
return r;
}

return sd_event_source_set_floating(event_source, true);
}

/************************************************************************
************** org.eclipse.bluechi.Controller.ListUnits *****
************************************************************************/
Expand Down Expand Up @@ -1082,6 +1199,12 @@ bool controller_start(Controller *controller) {
return false;
}

r = controller_setup_heartbeat_timer(controller);
if (r < 0) {
bc_log_errorf("Failed to set up controller heartbeat timer: %s", strerror(-r));
return false;
}

ShutdownHook hook;
hook.shutdown = (ShutdownHookFn) controller_stop;
hook.userdata = controller;
Expand Down
2 changes: 2 additions & 0 deletions src/controller/controller.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ struct Controller {

uint16_t port;
char *api_bus_service_name;
long heartbeat_interval_msec;
long heartbeat_threshold_msec;

sd_event *event;
sd_event_source *node_connection_source;
Expand Down
21 changes: 15 additions & 6 deletions src/controller/node.c
Original file line number Diff line number Diff line change
Expand Up @@ -520,11 +520,13 @@ static int node_match_job_done(UNUSED sd_bus_message *m, UNUSED void *userdata,
static int node_match_heartbeat(UNUSED sd_bus_message *m, void *userdata, UNUSED sd_bus_error *error) {
Node *node = userdata;

int r = get_time_seconds(&node->last_seen);
if (r < 0) {
bc_log_errorf("Failed to get current time on heartbeat: %s", strerror(-r));
uint64_t now = get_time_micros();
if (now == 0) {
bc_log_error("Failed to get current time on heartbeat");
return 0;
}

node->last_seen = now;
return 1;
}

Expand Down Expand Up @@ -787,7 +789,7 @@ bool node_set_agent_bus(Node *node, sd_bus *bus) {
bc_log_errorf("Failed to emit status property changed: %s", strerror(-r));
}

sd_bus_match_signal(
r = sd_bus_match_signal(
bus,
NULL,
NULL,
Expand Down Expand Up @@ -887,6 +889,8 @@ static int node_method_register(sd_bus_message *m, void *userdata, UNUSED sd_bus
m, SD_BUS_ERROR_ADDRESS_IN_USE, "The node is already connected");
}

named_node->last_seen = get_time_micros();

r = asprintf(&description, "node-%s", name);
if (r >= 0) {
(void) sd_bus_set_description(node->agent_bus, description);
Expand Down Expand Up @@ -915,6 +919,13 @@ static int node_method_register(sd_bus_message *m, void *userdata, UNUSED sd_bus

static int node_disconnected(UNUSED sd_bus_message *message, void *userdata, UNUSED sd_bus_error *error) {
Node *node = userdata;

node_disconnect(node);

return 0;
}

void node_disconnect(Node *node) {
Controller *controller = node->controller;
void *item = NULL;
size_t i = 0;
Expand Down Expand Up @@ -1009,8 +1020,6 @@ static int node_disconnected(UNUSED sd_bus_message *message, void *userdata, UNU
/* update number of online nodes and check the new system state */
controller_check_system_status(controller, controller->number_of_nodes_online--);
}

return 0;
}

const char *node_get_status(Node *node) {
Expand Down
3 changes: 2 additions & 1 deletion src/controller/node.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ struct Node {
LIST_HEAD(ProxyDependency, proxy_dependencies);

struct hashmap *unit_subscriptions;
time_t last_seen;
uint64_t last_seen;

bool is_shutdown;
};
Expand All @@ -71,6 +71,7 @@ Node *node_new(Controller *controller, const char *name);
Node *node_ref(Node *node);
void node_unref(Node *node);
void node_shutdown(Node *node);
void node_disconnect(Node *node);

const char *node_get_status(Node *node);

Expand Down
13 changes: 13 additions & 0 deletions src/libbluechi/common/cfg.c
Original file line number Diff line number Diff line change
Expand Up @@ -507,5 +507,18 @@ int cfg_controller_def_conf(struct config *config) {
return result;
}

if ((result = cfg_set_value(
config, CFG_HEARTBEAT_INTERVAL, CONTROLLER_DEFAULT_HEARTBEAT_INTERVAL_MSEC)) !=
0) {
return result;
}

if ((result = cfg_set_value(
config,
CFG_HEARTBEAT_THRESHOLD,
CONTROLLER_DEFAULT_NODE_HEARTBEAT_THRESHOLD_MSEC)) != 0) {
return result;
}

return 0;
}
1 change: 1 addition & 0 deletions src/libbluechi/common/cfg.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#define CFG_NODE_NAME "NodeName"
#define CFG_ALLOWED_NODE_NAMES "AllowedNodeNames"
#define CFG_HEARTBEAT_INTERVAL "HeartbeatInterval"
#define CFG_HEARTBEAT_THRESHOLD "NodeHeartbeatThreshold"
#define CFG_IP_RECEIVE_ERRORS "IPReceiveErrors"
#define CFG_TCP_KEEPALIVE_TIME "TCPKeepAliveTime"
#define CFG_TCP_KEEPALIVE_INTERVAL "TCPKeepAliveInterval"
Expand Down
3 changes: 3 additions & 0 deletions src/libbluechi/common/protocol.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
/* Time constants */
#define USEC_PER_SEC 1000000
#define USEC_PER_MSEC 1000
#define NSEC_PER_USEC 1000

/* Configuration defaults */
#define BC_DEFAULT_PORT "842"
Expand All @@ -26,6 +27,8 @@
#define BC_DEFAULT_DBUS_TIMEOUT ((uint64_t) (USEC_PER_SEC * 30))
/* Application-level heartbeat interval */
#define AGENT_DEFAULT_HEARTBEAT_INTERVAL_MSEC "2000"
#define CONTROLLER_DEFAULT_HEARTBEAT_INTERVAL_MSEC "0"
#define CONTROLLER_DEFAULT_NODE_HEARTBEAT_THRESHOLD_MSEC "6000"


/* BlueChi DBus service names */
Expand Down
6 changes: 6 additions & 0 deletions tests/bluechi_test/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def __init__(
name: str = "bluechi-controller",
port: str = "8420",
allowed_node_names: List[str] = [],
heartbeat_interval: str = "0",
node_heartbeat_threshold: str = "6000",
log_level: str = "DEBUG",
log_target: str = "journald",
log_is_quiet: bool = False,
Expand All @@ -41,6 +43,8 @@ def __init__(
self.name = name
self.port = port
self.allowed_node_names = allowed_node_names
self.heartbeat_interval = heartbeat_interval
self.node_heartbeat_threshold = node_heartbeat_threshold
self.log_level = log_level
self.log_target = log_target
self.log_is_quiet = log_is_quiet
Expand All @@ -53,6 +57,8 @@ def serialize(self) -> str:
return f"""[bluechi-controller]
ControllerPort={self.port}
AllowedNodeNames={allowed_node_names}
HeartbeatInterval={self.heartbeat_interval}
NodeHeartbeatThreshold={self.node_heartbeat_threshold}
LogLevel={self.log_level}
LogTarget={self.log_target}
LogIsQuiet={self.log_is_quiet}
Expand Down
3 changes: 3 additions & 0 deletions tests/tests/tier0/bluechi-heartbeat-disabled/main.fmf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
summary: Test if default configuration of controller disables periodic heartbeat of
the controller
id: a04517f6-8be1-468b-8dc0-f9674690f73f
Loading