diff --git a/CHANGES.d/20240417_145430_mb_FC_35378_httpwatchdog.md b/CHANGES.d/20240417_145430_mb_FC_35378_httpwatchdog.md
new file mode 100644
index 0000000..714c63a
--- /dev/null
+++ b/CHANGES.d/20240417_145430_mb_FC_35378_httpwatchdog.md
@@ -0,0 +1,4 @@
+* Added component `batou_ext.http.HTTPServiceWatchdog` that adds a check to a systemd unit
+  for whether a given URL is reachable (e.g. a `/health` endpoint). If the URL cannot be reached
+  within a certain interval, the service will be restarted. Further details are documented in
+  the docstring.
diff --git a/src/batou_ext/http.py b/src/batou_ext/http.py
index 8c126b9..1ebf88d 100644
--- a/src/batou_ext/http.py
+++ b/src/batou_ext/http.py
@@ -1,5 +1,8 @@
 import batou.component
 import batou.lib.file
+import pkg_resources
+
+import batou_ext.nix
 
 
 class HTTPBasicAuth(batou.component.Component):
@@ -56,3 +59,186 @@ def _deploy_customer_http_auth_file(self):
             "htpasswd_{}".format(self.env_name), content=self.basic_auth_string
         )
         self.path = self._.path
+
+
+@batou_ext.nix.rebuild
+class HTTPServiceWatchdog(batou.component.Component):
+    """
+    Adds a "watchdog" on top of a systemd service that checks within a given interval
+    whether an HTTP URL is available and restarts the service if not.
+
+    Usage:
+
+        self += HTTPWatchdogScript()
+        self += HTTPServiceWatchdog(
+            "application",
+            script=self.path_to_script_that_starts_application,
+            healthcheck_url="https://example.com",
+            watchdog_script_path=self._.path
+        )
+
+    This creates a systemd service `application.service` that will be restarted
+    if no HTTP request against :arg:`healthcheck_url` succeeds within 64 seconds.
+
+    Arguments
+    ---------
+
+    * `service`: name of the systemd unit a watchdog is attached to.
+    * `predefined_service`: whether the systemd unit is defined in another
+      module, e.g. in NixOS itself (see below).
+    * `script`: path to the script that gets started by the service.
+    * `healthcheck_url`: the URL HTTP requests are issued against.
+    * `healthcheck_timeout`: timeout for the HTTP request against :arg:`healthcheck_url`.
+    * `check_interval`: time to sleep between two healthchecks.
+    * `startup_check_interval`: time to sleep between healthchecks on startup.
+    * `start_timeout`: how long until the first HTTP request must succeed (see below for details).
+    * `watchdog_interval`: interval within which at least one healthcheck must pass.
+
+    Existing services
+    -----------------
+
+    If the watchdog needs to be added to an existing service, e.g.
+    a service from NixOS itself, it can be done like this:
+
+        self += HTTPServiceWatchdog(
+            "docker-foobar",
+            predefined_service=True,
+            # ...
+        )
+
+    This only works if `systemd.services.docker-foobar.script` was declared
+    by the module defining `docker-foobar.service`. If
+    `systemd.services.docker-foobar.serviceConfig.ExecStart` is used by the module,
+    this won't work.
+
+    Inner workings
+    --------------
+
+    `systemd` provides a watchdog feature: the service needs to send `WATCHDOG=1`
+    once within a given interval to `sd_notify(3)`. If that doesn't happen, the
+    service will be aborted. Via an auto-restart defined in the unit, it will be
+    started up again.
+
+    This mechanism is used by this component:
+
+    * It overrides the systemd service with a script that forks:
+
+      * The parent process is the running application. This one
+        must be started by the executable file passed to this
+        component via :arg:`script`.
+      * The child is a script that regularly checks whether `healthcheck_url`
+        is available and sends `WATCHDOG=1` if that's the case.
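+
+        Schematically, the fork looks like this (a simplified sketch with
+        hypothetical names, not the actual wrapper script):
+
+            import os
+
+            pid = os.fork()
+            if pid > 0:
+                # parent: become the application, i.e. the unit's main process
+                os.execvp(command[0], command)
+            else:
+                # child: loop forever, healthchecking and petting the watchdog
+                monitor()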
+
+    * Certain applications can take a while to start up.
+      When the unit is started, the forked-off child process sends a request
+      to :arg:`healthcheck_url` every :arg:`startup_check_interval` seconds.
+
+      This is done until either
+
+      * one request succeeds. The application is then considered "up"
+        and the watchdog starts.
+
+        Before that's the case, `systemd` will list the unit as "activating"
+        rather than "active".
+
+      * a timeout set by :arg:`start_timeout` is reached. `systemd` will abort
+        the application startup and terminate the unit. Because the unit is
+        configured to restart, a startup will be attempted again.
+
+        If the startup attempts should be aborted eventually, this can be
+        implemented via the options `StartLimitBurst`/`StartLimitIntervalSec`
+        in the systemd unit. The options are documented in `systemd.unit(5)`.
+
+        See below on how to modify the unit created by this component.
+
+    * As long as :arg:`healthcheck_url` is up, a request will be issued every
+      :arg:`check_interval` seconds and after each successful request
+      `WATCHDOG=1` will be sent to `sd_notify(3)`.
+
+      As soon as requests against the URL stop succeeding, an exponential
+      backoff kicks in: after the request against :arg:`healthcheck_url` has
+      failed `n` times, 2^n seconds of sleep between two healthchecks are
+      added to the :arg:`check_interval` seconds of sleep.
+
+      For instance, after one failed request, the watchdog will wait 2^1=2
+      seconds in addition to :arg:`check_interval`.
+
+      If :arg:`watchdog_interval` is 32 and :arg:`check_interval` is 1, there
+      are three attempts to recover: each failed attempt `n` costs 2^n seconds
+      of sleep plus the 1s sleep interval, i.e.
+      `2^1 + 1 + 2^2 + 1 + 2^3 + 1 = 17s` for the three attempts so far.
+      The timeout of 1s for the HTTP request is negligible in this example
+      and thus left out.
+
+      Because the next failure is the fourth one, a sleep of 2^4 + 1 seconds
+      (=17s) will be started. Since 17+17 > 32, the watchdog will kill the
+      process before another attempt can be made (see the sanity check at
+      the end of this docstring).
+
+    Customize the systemd unit
+    --------------------------
+
+    The systemd unit will be written as Nix code to the target machine.
+    This means that overrides are possible via the Nix module system. For
+    instance, `StartLimitBurst`/`StartLimitIntervalSec` can be added like this:
+
+        self += HTTPServiceWatchdog("foobar", ...)
+        self += File(
+            "/etc/local/nixos/customize-foobar.nix",
+            content="{ systemd.services.foobar.serviceConfig = { "
+            + "StartLimitIntervalSec = ...; StartLimitBurst = ... }; }"
+        )
+
+    Limitations
+    -----------
+
+    * The service injects a custom Python interpreter that's used by the
+      watchdog script. It's highly recommended to override the PATH in
+      :arg:`script`, e.g. by sourcing `/etc/profile`.
+    * The running application must not kill the watchdog.
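+
+    The timing example above can be reproduced with a few lines of Python
+    (an illustration only, not part of the component):
+
+        check_interval = 1
+        slept = sum(2**n + check_interval for n in range(1, 4))  # 2+1+4+1+8+1 = 17
+        next_sleep = 2**4 + check_interval                       # 16+1 = 17
+        assert slept + next_sleep > 32  # the watchdog fires first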
+ """ + + namevar = "service" + predefined_service = batou.component.Attribute("literal", default=False) + watchdog_script_path = batou.component.Attribute(str) + script = batou.component.Attribute(str, default=None) + + healthcheck_url = batou.component.Attribute(str) + healthcheck_timeout = batou.component.Attribute(int, default=2) + check_interval = batou.component.Attribute(int, default=2) + startup_check_interval = batou.component.Attribute(int, default=4) + start_timeout = batou.component.Attribute(int, default=64) + watchdog_interval = batou.component.Attribute(int, default=64) + + def configure(self): + if self.predefined_service and self.script is not None: + raise ValueError( + f"batou_ext.http.HTTPServiceWatchdog({self.service}): cannot set 'script' if service is predefined." + ) + if not self.predefined_service and self.script is None: + raise ValueError( + f"batou_ext.http.HTTPServiceWatchdog({self.service}): must set script if predefined_service is False." + ) + + self += batou.lib.file.File( + f"/etc/local/nixos/{self.service}-watchdog.nix", + content=pkg_resources.resource_string( + __name__, "resources/http-watchdog.nix" + ), + ) + + +class HTTPWatchdogScript(batou.component.Component): + """ + Writes the wrapper script for the HTTP watchdog into the service user's home. + Only needed in conjunction with :class:`batou_ext.http.HTTPServiceWatchdog`. + """ + + def configure(self): + self += batou.lib.file.File( + "watchdog-wrapper.py", + mode=0o755, + content=pkg_resources.resource_string( + __name__, "resources/watchdog-wrapper.py" + ), + ) + self.path = self._.path diff --git a/src/batou_ext/resources/http-watchdog.nix b/src/batou_ext/resources/http-watchdog.nix new file mode 100644 index 0000000..b7c5d9b --- /dev/null +++ b/src/batou_ext/resources/http-watchdog.nix @@ -0,0 +1,50 @@ +{ config, lib, pkgs, ... }: + +let + script = + # {% if component.predefined_service %} + (pkgs.writeShellScript + "{{ component.service }}-start" + config.systemd.services."{{ component.service }}".script) + # {% else %} + "{{ component.script }}" + # {% endif %} + ; +in { + # {% if component.predefined_service %} + assertions = [ + { assertion = config.systemd.services."{{ component.service }}".script != ""; + message = "Service {{ component.service }} needs to have `script` set!"; + } + ]; + # {% endif %} + + systemd.services."{{ component.service }}" = { + path = [ + config.systemd.package + # interpreter for the watchdog script + (pkgs.python3.withPackages (ps: with ps; [ + requests + ])) + ]; + serviceConfig = { + Type = "notify"; + # Allow child processes of ExecStart + # to do `sd_notify(3)`. 
+ NotifyAccess = "all"; + Restart = "always"; + TimeoutStartSec = lib.mkForce "{{ component.start_timeout }}"; + WatchdogSec = "{{ component.watchdog_interval }}"; + + ExecStart = lib.mkForce (pkgs.writeShellScript "watchdog-{{ component.service }}" '' + exec {{ component.watchdog_script_path }} \ + --healthcheck-url {{ component.healthcheck_url }} \ + --healthcheck-timeout {{ component.healthcheck_timeout }} \ + --watcher-loglevel info \ + --startup-check-interval {{ component.startup_check_interval }} \ + --check-interval {{ component.check_interval }} \ + -- ${script} + ''); + }; + }; +} diff --git a/src/batou_ext/resources/watchdog-wrapper.py b/src/batou_ext/resources/watchdog-wrapper.py new file mode 100644 index 0000000..a5056de --- /dev/null +++ b/src/batou_ext/resources/watchdog-wrapper.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python + +import logging +from argparse import ArgumentParser +from dataclasses import dataclass +from logging import info, warning +from math import ceil +from os import environ, execvp, fork +from subprocess import check_call +from time import sleep + +import requests + + +@dataclass +class HealthCheckResult: + success: bool + summary: Exception | None = None + + def __bool__(self) -> bool: + return self.success + + def __str__(self) -> str: + assert self.summary is not None + return str(self.summary) + + +# Sentinel value for a successful healthcheck. Doesn't +# really make sense to instantiate a new HealthCheckResult whenever +# a healthcheck passes. +HEALTHCHECK_SUCCESS = HealthCheckResult(True) + + +class ExponentialBackoff: + def __init__(self, static_check_interval: int): + self.static_check_interval = static_check_interval + self.last_healthcheck = HEALTHCHECK_SUCCESS + self.__reset() + + def failure(self, last_failure_reason: HealthCheckResult): + self.n_failures += 1 + self.last_healthcheck = last_failure_reason + + def sleep(self, timeout: int): + seconds_to_sleep = self.static_check_interval + (2**self.n_failures) + if self.last_healthcheck: + self.__reset() + else: + self.__report_failing_healthcheck( + seconds_to_sleep, + self.__will_sleep_exceed_watchdog(timeout, seconds_to_sleep), + ) + self.last_healthcheck = HEALTHCHECK_SUCCESS + + sleep(seconds_to_sleep) + + def __report_failing_healthcheck( + self, sleep_seconds: int, will_time_out: bool + ): + log_message = f"Healthcheck failure (Reason: {self.last_healthcheck}), sleeping {sleep_seconds}." + if will_time_out: + warning(f"{log_message} Watchdog will likely kill process") + else: + info(log_message) + + def __will_sleep_exceed_watchdog( + self, watchdog_timeout: int, seconds_to_sleep: int + ) -> bool: + # Seconds that were slept so far + seconds that will be slept. + # This doesn't take the time spent with the healthcheck into account, + # but that's only an approximation for logging. 
+        total_sleep = (
+            # Seconds to be slept on the next sleep
+            seconds_to_sleep
+            # 2^n seconds of sleep for each 0 < n < self.n_failures
+            + (2**self.n_failures - 2)
+            # on each sleep, an additional check_interval seconds were slept
+            + (self.n_failures - 1) * self.static_check_interval
+        )
+
+        return total_sleep >= watchdog_timeout
+
+    def __reset(self):
+        self.n_failures = 0
+
+
+def is_service_available(url: str, timeout: int) -> HealthCheckResult:
+    try:
+        requests.get(url, timeout=timeout).raise_for_status()
+        return HEALTHCHECK_SUCCESS
+    except Exception as ex:
+        return HealthCheckResult(False, ex)
+
+
+def await_service(
+    url: str, healthcheck_timeout: int, startup_loop_interval: int
+):
+    # No need to handle timeouts here: due to `Type=notify`,
+    # this unit won't be up until this loop has terminated.
+    # The timeout for that can be controlled in the unit directly
+    # via TimeoutStartSec from `systemd.service(5)`.
+    while not is_service_available(url, healthcheck_timeout):
+        info(f"Service isn't up yet, sleeping {startup_loop_interval}s")
+        sleep(startup_loop_interval)
+
+    # Tell the service manager that we're ready.
+    # Because of this, the unit's state transitions from 'activating' to 'active'.
+    check_call(["systemd-notify", "--ready"])
+    info("Service ready, now monitoring.")
+
+
+def monitor_service(
+    url: str,
+    watchdog_timeout: int,
+    healthcheck_timeout: int,
+    min_check_interval: int,
+):
+    # For WatchdogSec=32 with a sleep of 1s between all attempts, there are
+    # three attempts to recover.
+    # I.e. 2^n seconds of sleep + 1s of sleep interval.
+    # For each of the three attempts so far this is
+    #   2^1 + 1 + 2^2 + 1 + 2^3 + 1 = 17s.
+    # Because the next failure is the fourth one, a sleep of 2^4 + 1 seconds
+    # (=17s) will be started. Since 17+17 > 32, the watchdog will kill the
+    # process before another attempt can be made.
+    #
+    # The extra 1s (=min_check_interval) is added to provide a configurable
+    # grace period between healthchecks. In some cases it may not be desirable
+    # to issue a healthcheck every second.
+    exp_backoff = ExponentialBackoff(min_check_interval)
+
+    while True:
+        if result := is_service_available(url, healthcheck_timeout):
+            check_call(["systemd-notify", "WATCHDOG=1"])
+        else:
+            exp_backoff.failure(result)
+
+        exp_backoff.sleep(watchdog_timeout)
+
+
+if __name__ == "__main__":
+    argparser = ArgumentParser()
+    argparser.add_argument(
+        "--healthcheck-url",
+        help="URL to issue a request against to check the service's health",
+    )
+    argparser.add_argument(
+        "--healthcheck-timeout",
+        default=2,
+        type=int,
+        help="How many seconds until the healthcheck request times out",
+    )
+    argparser.add_argument(
+        "--watcher-loglevel",
+        default="warning",
+        help="Loglevel of the watchdog script, doesn't influence the main process",
+    )
+    argparser.add_argument(
+        "--startup-check-interval",
+        default=4,
+        type=int,
+        help="Seconds to wait between healthchecks when the service is starting up",
+    )
+    argparser.add_argument(
+        "--check-interval",
+        default=2,
+        type=int,
+        help="Seconds to wait between healthchecks when the service is running",
+    )
+    argparser.add_argument(
+        "command",
+        nargs="+",
+        help="Service process that will be watched by this script",
+    )
+    args = argparser.parse_args()
+
+    # After how much time (in microseconds) the watchdog must be petted.
+    # Set by systemd directly.
+    assert (
+        "WATCHDOG_USEC" in environ
+    ), "WATCHDOG_USEC not in environment, please configure WatchdogSec in the systemd service!"
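+    # WATCHDOG_USEC is in microseconds; the backoff arithmetic above works
+    # in whole seconds, hence the integer division.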
+    watchdog_timeout = int(environ["WATCHDOG_USEC"]) // 1_000_000
+
+    pid = fork()
+    if pid > 0:
+        # By making the service the main process, it will be restarted
+        # immediately when it crashes rather than waiting for the
+        # exponential backoff to fail.
+        pass_args = args.command
+        execvp(pass_args[0], pass_args)
+    else:
+        logging.basicConfig(level=args.watcher_loglevel.upper())
+        info("Starting watcher for the application as child process")
+
+        await_service(
+            args.healthcheck_url,
+            args.healthcheck_timeout,
+            args.startup_check_interval,
+        )
+        monitor_service(
+            args.healthcheck_url,
+            watchdog_timeout,
+            args.healthcheck_timeout,
+            args.check_interval,
+        )
diff --git a/src/batou_ext/tests/test_configure.py b/src/batou_ext/tests/test_configure.py
index dd9fd49..dacad13 100644
--- a/src/batou_ext/tests/test_configure.py
+++ b/src/batou_ext/tests/test_configure.py
@@ -168,4 +168,8 @@ def test_prepare(root, mocker, component, tmpdir):
         return
         # instance.name = "mymod"
         # (tmpdir / "mymod.nix").write_text("{}", encoding="US-ASCII")
+    elif component_name == "batou_ext.http.HTTPServiceWatchdog":
+        instance.script = "/srv/s-myuser/start-application.sh"
+        instance.watchdog_script_path = "/srv/s-myuser/watchdog-wrapper.py"
+        instance.healthcheck_url = "https://example.com"
     instance.prepare(root)