diff --git a/config/agent/agent.conf b/config/agent/agent.conf index b98dbdad63..2d4166050b 100644 --- a/config/agent/agent.conf +++ b/config/agent/agent.conf @@ -26,7 +26,7 @@ #ControllerAddress= # -# Defines the interval between two heartbeat signals sent to bluechi in milliseconds. +# Defines the interval between two heartbeat signals sent to bluechi in milliseconds. A value of 0 disables it. #HeartbeatInterval=2000 # diff --git a/config/controller/controller.conf b/config/controller/controller.conf index 707151822a..5fb7d3ce3a 100644 --- a/config/controller/controller.conf +++ b/config/controller/controller.conf @@ -16,6 +16,16 @@ # The port the controller listens on to establish connections with the bluechi agents. #ControllerPort=842 +# +# Defines the interval for which the bluechi-controller periodically checks the connectivity of all nodes based on +# the received heartbeat signals sent to bluechi-agent to bluechi-controller, in milliseconds. +#HeartbeatInterval=0 + +# +# Defines the threshold to actively disconnect nodes based on the time difference from the last received heartbeat +# signal from the respective bluechi-agent. In milliseconds. +#NodeHeartbeatThreshold=6000 + # # The level used for logging. Supported values are: DEBUG, INFO, WARN and ERROR. #LogLevel=INFO diff --git a/doc/man/bluechi-controller.conf.5.md b/doc/man/bluechi-controller.conf.5.md index 71eb47fb34..0015900fbe 100644 --- a/doc/man/bluechi-controller.conf.5.md +++ b/doc/man/bluechi-controller.conf.5.md @@ -27,6 +27,14 @@ A comma separated list of unique bluechi-agent names. It's mandatory to set the in the list can connect to `bluechi-controller'. These names are defined in the agent's configuration file under `NodeName` option (see `bluechi-agent.conf(5)`). +#### **HeartbeatInterval** (long) + +The interval to periodically check node's connectivity based on heartbeat signals sent to bluechi, in milliseconds. A value of 0 disables it. + +#### **NodeHeartbeatThreshold** (long) + +The threshold in milliseconds to determine whether a node is disconnected. If the node's last heartbeat signal was received before this threshold, bluechi assumes that the node is down or the connection was cut off and performs a disconnect. + #### **LogLevel** (string) The level used for logging. Supported values are: diff --git a/src/agent/agent.c b/src/agent/agent.c index a30b7f896f..2d5711a166 100644 --- a/src/agent/agent.c +++ b/src/agent/agent.c @@ -187,7 +187,8 @@ static int agent_setup_heartbeat_timer(Agent *agent) { assert(agent); if (agent->heartbeat_interval_msec <= 0) { - bc_log_warnf("Heartbeat disabled since interval is '%d', ", agent->heartbeat_interval_msec); + bc_log_warnf("Heartbeat disabled since configured interval '%d' is <=0", + agent->heartbeat_interval_msec); return 0; } diff --git a/src/client/method-status.c b/src/client/method-status.c index 4d8279dcdb..dfb594978a 100644 --- a/src/client/method-status.c +++ b/src/client/method-status.c @@ -488,8 +488,8 @@ static char *node_connection_fmt_last_seen(NodeConnection *con) { } struct timespec t; - t.tv_sec = (time_t) con->last_seen; - t.tv_nsec = 0; + t.tv_sec = (time_t) con->last_seen / USEC_PER_SEC; + t.tv_nsec = (long) (con->last_seen % USEC_PER_SEC) * NSEC_PER_USEC; return get_formatted_log_timestamp_for_timespec(t, false); } diff --git a/src/controller/controller.c b/src/controller/controller.c index 155ff772a2..488790883e 100644 --- a/src/controller/controller.c +++ b/src/controller/controller.c @@ -10,6 +10,7 @@ #include "libbluechi/bus/utils.h" #include "libbluechi/common/cfg.h" #include "libbluechi/common/common.h" +#include "libbluechi/common/event-util.h" #include "libbluechi/common/parse-util.h" #include "libbluechi/common/time-util.h" #include "libbluechi/log/log.h" @@ -276,6 +277,28 @@ bool controller_set_port(Controller *controller, const char *port_s) { return true; } +bool controller_set_heartbeat_interval(Controller *controller, const char *interval_msec) { + long interval = 0; + + if (!parse_long(interval_msec, &interval)) { + bc_log_errorf("Invalid heartbeat interval format '%s'", interval_msec); + return false; + } + controller->heartbeat_interval_msec = interval; + return true; +} + +bool controller_set_heartbeat_threshold(Controller *controller, const char *threshold_msec) { + long threshold = 0; + + if (!parse_long(threshold_msec, &threshold)) { + bc_log_errorf("Invalid heartbeat threshold format '%s'", threshold_msec); + return false; + } + controller->heartbeat_threshold_msec = threshold; + return true; +} + bool controller_parse_config(Controller *controller, const char *configfile) { int result = 0; @@ -330,6 +353,20 @@ bool controller_apply_config(Controller *controller) { } } + const char *interval_msec = cfg_get_value(controller->config, CFG_HEARTBEAT_INTERVAL); + if (interval_msec) { + if (!controller_set_heartbeat_interval(controller, interval_msec)) { + return false; + } + } + + const char *threshold_msec = cfg_get_value(controller->config, CFG_HEARTBEAT_THRESHOLD); + if (threshold_msec) { + if (!controller_set_heartbeat_threshold(controller, threshold_msec)) { + return false; + } + } + /* Set socket options used for peer connections with the agents */ const char *keepidle = cfg_get_value(controller->config, CFG_TCP_KEEPALIVE_TIME); if (keepidle) { @@ -444,6 +481,86 @@ static bool controller_setup_node_connection_handler(Controller *controller) { return true; } +static int controller_reset_heartbeat_timer(Controller *controller, sd_event_source **event_source); + +static int controller_heartbeat_timer_callback( + sd_event_source *event_source, UNUSED uint64_t usec, void *userdata) { + Controller *controller = (Controller *) userdata; + Node *node = NULL; + int r = 0; + + LIST_FOREACH(nodes, node, controller->nodes) { + uint64_t diff = 0; + uint64_t now = 0; + + if (!node_is_online(node)) { + continue; + } + + now = get_time_micros(); + if (now == 0) { + bc_log_error("Failed to get the time"); + continue; + } + + if (now < node->last_seen) { + bc_log_error("Clock skew detected"); + continue; + } + + diff = now - node->last_seen; + if (diff > (uint64_t) controller->heartbeat_threshold_msec * USEC_PER_MSEC) { + bc_log_infof("Did not receive heartbeat from node '%s' since '%d'ms. Disconnecting it...", + node->name, + controller->heartbeat_threshold_msec); + node_disconnect(node); + } + } + + r = controller_reset_heartbeat_timer(controller, &event_source); + if (r < 0) { + bc_log_errorf("Failed to reset controller heartbeat timer: %s", strerror(-r)); + return r; + } + + return 0; +} + +static int controller_reset_heartbeat_timer(Controller *controller, sd_event_source **event_source) { + return event_reset_time_relative( + controller->event, + event_source, + CLOCK_BOOTTIME, + controller->heartbeat_interval_msec * USEC_PER_MSEC, + 0, + controller_heartbeat_timer_callback, + controller, + 0, + "controller-heartbeat-timer-source", + false); +} + +static int controller_setup_heartbeat_timer(Controller *controller) { + _cleanup_(sd_event_source_unrefp) sd_event_source *event_source = NULL; + int r = 0; + + assert(controller); + + if (controller->heartbeat_interval_msec <= 0) { + bc_log_warnf("Heartbeat disabled since configured interval '%d' is <=0", + controller->heartbeat_interval_msec); + return 0; + } + + r = controller_reset_heartbeat_timer(controller, &event_source); + if (r < 0) { + bc_log_errorf("Failed to reset controller heartbeat timer: %s", strerror(-r)); + return r; + } + + return sd_event_source_set_floating(event_source, true); +} + /************************************************************************ ************** org.eclipse.bluechi.Controller.ListUnits ***** ************************************************************************/ @@ -1082,6 +1199,12 @@ bool controller_start(Controller *controller) { return false; } + r = controller_setup_heartbeat_timer(controller); + if (r < 0) { + bc_log_errorf("Failed to set up controller heartbeat timer: %s", strerror(-r)); + return false; + } + ShutdownHook hook; hook.shutdown = (ShutdownHookFn) controller_stop; hook.userdata = controller; diff --git a/src/controller/controller.h b/src/controller/controller.h index 5f19d0bdd5..f8b17ea229 100644 --- a/src/controller/controller.h +++ b/src/controller/controller.h @@ -19,6 +19,8 @@ struct Controller { uint16_t port; char *api_bus_service_name; + long heartbeat_interval_msec; + long heartbeat_threshold_msec; sd_event *event; sd_event_source *node_connection_source; diff --git a/src/controller/node.c b/src/controller/node.c index ccf155794e..317006baee 100644 --- a/src/controller/node.c +++ b/src/controller/node.c @@ -520,11 +520,13 @@ static int node_match_job_done(UNUSED sd_bus_message *m, UNUSED void *userdata, static int node_match_heartbeat(UNUSED sd_bus_message *m, void *userdata, UNUSED sd_bus_error *error) { Node *node = userdata; - int r = get_time_seconds(&node->last_seen); - if (r < 0) { - bc_log_errorf("Failed to get current time on heartbeat: %s", strerror(-r)); + uint64_t now = get_time_micros(); + if (now == 0) { + bc_log_error("Failed to get current time on heartbeat"); return 0; } + + node->last_seen = now; return 1; } @@ -787,7 +789,7 @@ bool node_set_agent_bus(Node *node, sd_bus *bus) { bc_log_errorf("Failed to emit status property changed: %s", strerror(-r)); } - sd_bus_match_signal( + r = sd_bus_match_signal( bus, NULL, NULL, @@ -887,6 +889,8 @@ static int node_method_register(sd_bus_message *m, void *userdata, UNUSED sd_bus m, SD_BUS_ERROR_ADDRESS_IN_USE, "The node is already connected"); } + named_node->last_seen = get_time_micros(); + r = asprintf(&description, "node-%s", name); if (r >= 0) { (void) sd_bus_set_description(node->agent_bus, description); @@ -915,6 +919,13 @@ static int node_method_register(sd_bus_message *m, void *userdata, UNUSED sd_bus static int node_disconnected(UNUSED sd_bus_message *message, void *userdata, UNUSED sd_bus_error *error) { Node *node = userdata; + + node_disconnect(node); + + return 0; +} + +void node_disconnect(Node *node) { Controller *controller = node->controller; void *item = NULL; size_t i = 0; @@ -1009,8 +1020,6 @@ static int node_disconnected(UNUSED sd_bus_message *message, void *userdata, UNU /* update number of online nodes and check the new system state */ controller_check_system_status(controller, controller->number_of_nodes_online--); } - - return 0; } const char *node_get_status(Node *node) { diff --git a/src/controller/node.h b/src/controller/node.h index 96c109909b..61824876a5 100644 --- a/src/controller/node.h +++ b/src/controller/node.h @@ -62,7 +62,7 @@ struct Node { LIST_HEAD(ProxyDependency, proxy_dependencies); struct hashmap *unit_subscriptions; - time_t last_seen; + uint64_t last_seen; bool is_shutdown; }; @@ -71,6 +71,7 @@ Node *node_new(Controller *controller, const char *name); Node *node_ref(Node *node); void node_unref(Node *node); void node_shutdown(Node *node); +void node_disconnect(Node *node); const char *node_get_status(Node *node); diff --git a/src/libbluechi/common/cfg.c b/src/libbluechi/common/cfg.c index 43f3d4bb3d..fd1d485a73 100644 --- a/src/libbluechi/common/cfg.c +++ b/src/libbluechi/common/cfg.c @@ -507,5 +507,18 @@ int cfg_controller_def_conf(struct config *config) { return result; } + if ((result = cfg_set_value( + config, CFG_HEARTBEAT_INTERVAL, CONTROLLER_DEFAULT_HEARTBEAT_INTERVAL_MSEC)) != + 0) { + return result; + } + + if ((result = cfg_set_value( + config, + CFG_HEARTBEAT_THRESHOLD, + CONTROLLER_DEFAULT_NODE_HEARTBEAT_THRESHOLD_MSEC)) != 0) { + return result; + } + return 0; } diff --git a/src/libbluechi/common/cfg.h b/src/libbluechi/common/cfg.h index 70e76dd6ba..93c5063abf 100644 --- a/src/libbluechi/common/cfg.h +++ b/src/libbluechi/common/cfg.h @@ -20,6 +20,7 @@ #define CFG_NODE_NAME "NodeName" #define CFG_ALLOWED_NODE_NAMES "AllowedNodeNames" #define CFG_HEARTBEAT_INTERVAL "HeartbeatInterval" +#define CFG_HEARTBEAT_THRESHOLD "NodeHeartbeatThreshold" #define CFG_IP_RECEIVE_ERRORS "IPReceiveErrors" #define CFG_TCP_KEEPALIVE_TIME "TCPKeepAliveTime" #define CFG_TCP_KEEPALIVE_INTERVAL "TCPKeepAliveInterval" diff --git a/src/libbluechi/common/protocol.h b/src/libbluechi/common/protocol.h index b5552a0053..1c0068808c 100644 --- a/src/libbluechi/common/protocol.h +++ b/src/libbluechi/common/protocol.h @@ -10,6 +10,7 @@ /* Time constants */ #define USEC_PER_SEC 1000000 #define USEC_PER_MSEC 1000 +#define NSEC_PER_USEC 1000 /* Configuration defaults */ #define BC_DEFAULT_PORT "842" @@ -26,6 +27,8 @@ #define BC_DEFAULT_DBUS_TIMEOUT ((uint64_t) (USEC_PER_SEC * 30)) /* Application-level heartbeat interval */ #define AGENT_DEFAULT_HEARTBEAT_INTERVAL_MSEC "2000" +#define CONTROLLER_DEFAULT_HEARTBEAT_INTERVAL_MSEC "0" +#define CONTROLLER_DEFAULT_NODE_HEARTBEAT_THRESHOLD_MSEC "6000" /* BlueChi DBus service names */ diff --git a/tests/bluechi_test/config.py b/tests/bluechi_test/config.py index e8360b5198..ee5995104d 100644 --- a/tests/bluechi_test/config.py +++ b/tests/bluechi_test/config.py @@ -32,6 +32,8 @@ def __init__( name: str = "bluechi-controller", port: str = "8420", allowed_node_names: List[str] = [], + heartbeat_interval: str = "0", + node_heartbeat_threshold: str = "6000", log_level: str = "DEBUG", log_target: str = "journald", log_is_quiet: bool = False, @@ -41,6 +43,8 @@ def __init__( self.name = name self.port = port self.allowed_node_names = allowed_node_names + self.heartbeat_interval = heartbeat_interval + self.node_heartbeat_threshold = node_heartbeat_threshold self.log_level = log_level self.log_target = log_target self.log_is_quiet = log_is_quiet @@ -53,6 +57,8 @@ def serialize(self) -> str: return f"""[bluechi-controller] ControllerPort={self.port} AllowedNodeNames={allowed_node_names} +HeartbeatInterval={self.heartbeat_interval} +NodeHeartbeatThreshold={self.node_heartbeat_threshold} LogLevel={self.log_level} LogTarget={self.log_target} LogIsQuiet={self.log_is_quiet} diff --git a/tests/tests/tier0/bluechi-heartbeat-disabled/main.fmf b/tests/tests/tier0/bluechi-heartbeat-disabled/main.fmf new file mode 100644 index 0000000000..184c2b40cc --- /dev/null +++ b/tests/tests/tier0/bluechi-heartbeat-disabled/main.fmf @@ -0,0 +1,3 @@ +summary: Test if default configuration of controller disables periodic heartbeat of + the controller +id: a04517f6-8be1-468b-8dc0-f9674690f73f diff --git a/tests/tests/tier0/bluechi-heartbeat-disabled/python/is_node_connected.py b/tests/tests/tier0/bluechi-heartbeat-disabled/python/is_node_connected.py new file mode 100644 index 0000000000..3caa8b4f79 --- /dev/null +++ b/tests/tests/tier0/bluechi-heartbeat-disabled/python/is_node_connected.py @@ -0,0 +1,35 @@ +# +# Copyright Contributors to the Eclipse BlueChi project +# +# SPDX-License-Identifier: LGPL-2.1-or-later + +import time +import unittest + +from dasbus.typing import Variant + +from bluechi.api import Node + + +class TestNodeIsConnected(unittest.TestCase): + def test_node_is_connected(self): + + def on_state_change(state: Variant): + assert False + + n = Node("node-foo") + assert n.status == "online" + + n.on_status_changed(on_state_change) + timestamp = n.last_seen_timestamp + + # verify that the node remains connected and LastSeenTimespamp is not + # updated after more than NodeHeartbeatThreshold seconds have elapsed + time.sleep(10) + + assert n.status == "online" + assert n.last_seen_timestamp == timestamp + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/tests/tier0/bluechi-heartbeat-disabled/test_bluechi_heartbeat_disabled.py b/tests/tests/tier0/bluechi-heartbeat-disabled/test_bluechi_heartbeat_disabled.py new file mode 100644 index 0000000000..9f1aae3238 --- /dev/null +++ b/tests/tests/tier0/bluechi-heartbeat-disabled/test_bluechi_heartbeat_disabled.py @@ -0,0 +1,36 @@ +# +# Copyright Contributors to the Eclipse BlueChi project +# +# SPDX-License-Identifier: LGPL-2.1-or-later + +import os +from typing import Dict + +from bluechi_test.config import BluechiAgentConfig, BluechiControllerConfig +from bluechi_test.machine import BluechiAgentMachine, BluechiControllerMachine +from bluechi_test.test import BluechiTest + +node_foo_name = "node-foo" + + +def exec(ctrl: BluechiControllerMachine, nodes: Dict[str, BluechiAgentMachine]): + result, output = ctrl.run_python(os.path.join("python", "is_node_connected.py")) + if result != 0: + raise Exception(output) + + +def test_bluechi_heartbeat_disabled( + bluechi_test: BluechiTest, + bluechi_ctrl_default_config: BluechiControllerConfig, + bluechi_node_default_config: BluechiAgentConfig, +): + + bluechi_ctrl_default_config.allowed_node_names = [node_foo_name] + + bluechi_node_default_config.node_name = node_foo_name + bluechi_node_default_config.heartbeat_interval = "0" + + bluechi_test.set_bluechi_controller_config(bluechi_ctrl_default_config) + bluechi_test.add_bluechi_agent_config(bluechi_node_default_config) + + bluechi_test.run(exec) diff --git a/tests/tests/tier0/bluechi-heartbeat-node-disconnected/main.fmf b/tests/tests/tier0/bluechi-heartbeat-node-disconnected/main.fmf new file mode 100644 index 0000000000..745433d523 --- /dev/null +++ b/tests/tests/tier0/bluechi-heartbeat-node-disconnected/main.fmf @@ -0,0 +1,3 @@ +summary: Test if the node gets disconnected when did not receive heartbeat since + threshold from node +id: 0b087ba3-25fc-46b0-9c01-31bc2edf5209 diff --git a/tests/tests/tier0/bluechi-heartbeat-node-disconnected/python/is_node_disconnected.py b/tests/tests/tier0/bluechi-heartbeat-node-disconnected/python/is_node_disconnected.py new file mode 100644 index 0000000000..dd915ef102 --- /dev/null +++ b/tests/tests/tier0/bluechi-heartbeat-node-disconnected/python/is_node_disconnected.py @@ -0,0 +1,43 @@ +# +# Copyright Contributors to the Eclipse BlueChi project +# +# SPDX-License-Identifier: LGPL-2.1-or-later + +import time +import unittest + +from dasbus.loop import EventLoop +from dasbus.typing import Variant + +from bluechi.api import Node + + +class TestNodeIsDisconnected(unittest.TestCase): + + node_state = None + timestamp = None + + def test_node_is_disconnected(self): + loop = EventLoop() + + def on_state_change(state: Variant): + self.timestamp = round(time.time() * 1000000) + self.node_state = state.get_string() + loop.quit() + + n = Node("node-foo") + assert n.status == "online" + + n.on_status_changed(on_state_change) + timestamp = n.last_seen_timestamp + + loop.run() + + # verify that the node is disconnected after more than + # NodeHeartbeatThreshold seconds have elapsed + assert self.timestamp - timestamp > 6000000 + assert self.node_state == "offline" + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/tests/tier0/bluechi-heartbeat-node-disconnected/test_bluechi_heartbeat_node_disconnected.py b/tests/tests/tier0/bluechi-heartbeat-node-disconnected/test_bluechi_heartbeat_node_disconnected.py new file mode 100644 index 0000000000..0b3a637033 --- /dev/null +++ b/tests/tests/tier0/bluechi-heartbeat-node-disconnected/test_bluechi_heartbeat_node_disconnected.py @@ -0,0 +1,37 @@ +# +# Copyright Contributors to the Eclipse BlueChi project +# +# SPDX-License-Identifier: LGPL-2.1-or-later + +import os +from typing import Dict + +from bluechi_test.config import BluechiAgentConfig, BluechiControllerConfig +from bluechi_test.machine import BluechiAgentMachine, BluechiControllerMachine +from bluechi_test.test import BluechiTest + +node_foo_name = "node-foo" + + +def exec(ctrl: BluechiControllerMachine, nodes: Dict[str, BluechiAgentMachine]): + result, output = ctrl.run_python(os.path.join("python", "is_node_disconnected.py")) + if result != 0: + raise Exception(output) + + +def test_bluechi_heartbeat_node_disconnected( + bluechi_test: BluechiTest, + bluechi_ctrl_default_config: BluechiControllerConfig, + bluechi_node_default_config: BluechiAgentConfig, +): + + bluechi_ctrl_default_config.allowed_node_names = [node_foo_name] + bluechi_ctrl_default_config.heartbeat_interval = "2000" + + bluechi_node_default_config.node_name = node_foo_name + bluechi_node_default_config.heartbeat_interval = "0" + + bluechi_test.set_bluechi_controller_config(bluechi_ctrl_default_config) + bluechi_test.add_bluechi_agent_config(bluechi_node_default_config) + + bluechi_test.run(exec)