Skip to content

Commit

Permalink
feat(healthcheck): Various healthcheck improvements (#1166)
Browse files Browse the repository at this point in the history
## What ❔

- Adds `HealthStatus::ShuttingDown`, which is set immediately after a component
receives a termination signal. This makes the `/health` endpoint conform
to K8s readiness probe expectations.
- Makes slow / hard time limits for health checks configurable and
decreases their values by default.
- Adds metrics for slow, timed out and dropped health checks.

## Why ❔

Improves healthcheck observability.

## Checklist

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [x] Tests for the changes have been added / updated.
- [x] Documentation comments have been added / updated.
- [x] Code has been formatted via `zk fmt` and `zk lint`.
- [x] Spellcheck has been run via `zk spellcheck`.
- [x] Linkcheck has been run via `zk linkcheck`.

---------

Co-authored-by: Fedor Sakharov <[email protected]>
  • Loading branch information
slowli and montekki authored Feb 26, 2024
1 parent 6cd69aa commit 1e34148
Show file tree
Hide file tree
Showing 18 changed files with 520 additions and 194 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions core/bin/external_node/src/config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,14 @@ pub struct OptionalENConfig {
#[serde(default)]
pub filters_disabled: bool,

// Health checks
/// Time limit in milliseconds to mark a health check as slow and log the corresponding warning.
/// If not specified, the default value in the health check crate will be used.
healthcheck_slow_time_limit_ms: Option<u64>,
/// Time limit in milliseconds to abort a health check and return "not ready" status for the corresponding component.
/// If not specified, the default value in the health check crate will be used.
healthcheck_hard_time_limit_ms: Option<u64>,

// Gas estimation config
/// The factor by which to scale the gasLimit
#[serde(default = "OptionalENConfig::default_estimate_gas_scale_factor")]
Expand Down Expand Up @@ -377,6 +385,16 @@ impl OptionalENConfig {
/// Returns the maximum response body size in bytes, computed from the
/// megabyte value stored in `max_response_body_size_mb`.
pub fn max_response_body_size(&self) -> usize {
    let megabytes = self.max_response_body_size_mb;
    megabytes * BYTES_IN_MEGABYTE
}

/// Returns the configured "slow" health check time limit as a [`Duration`],
/// or `None` if `healthcheck_slow_time_limit_ms` is not set.
pub fn healthcheck_slow_time_limit(&self) -> Option<Duration> {
    let millis = self.healthcheck_slow_time_limit_ms?;
    Some(Duration::from_millis(millis))
}

/// Returns the configured "hard" health check time limit as a [`Duration`],
/// or `None` if `healthcheck_hard_time_limit_ms` is not set.
pub fn healthcheck_hard_time_limit(&self) -> Option<Duration> {
    let millis = self.healthcheck_hard_time_limit_ms?;
    Some(Duration::from_millis(millis))
}
}

/// This part of the external node config is required for its operation.
Expand Down
5 changes: 4 additions & 1 deletion core/bin/external_node/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,10 @@ async fn main() -> anyhow::Result<()> {

let main_node_client = <dyn MainNodeClient>::json_rpc(&main_node_url)
.context("Failed creating JSON-RPC client for main node")?;
let app_health = Arc::new(AppHealthCheck::default());
let app_health = Arc::new(AppHealthCheck::new(
config.optional.healthcheck_slow_time_limit(),
config.optional.healthcheck_hard_time_limit(),
));
app_health.insert_custom_component(Arc::new(MainNodeHealthCheck::from(
main_node_client.clone(),
)));
Expand Down
14 changes: 14 additions & 0 deletions core/lib/config/src/configs/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,12 +216,26 @@ impl Web3JsonRpcConfig {
pub struct HealthCheckConfig {
    /// Port to which the REST server is listening.
    pub port: u16,
    /// Time limit in milliseconds to mark a health check as slow and log the corresponding warning.
    /// If not specified, the default value in the health check crate will be used.
    pub slow_time_limit_ms: Option<u64>,
    /// Time limit in milliseconds to abort a health check and return "not ready" status for the corresponding component.
    /// If not specified, the default value in the health check crate will be used.
    pub hard_time_limit_ms: Option<u64>,
}

impl HealthCheckConfig {
    /// Returns the socket address made of the unspecified IPv4 address (`0.0.0.0`)
    /// and the configured `port`.
    pub fn bind_addr(&self) -> SocketAddr {
        // Built from a constant instead of parsing a string literal at runtime,
        // so no `unwrap()` is needed.
        SocketAddr::new(std::net::Ipv4Addr::UNSPECIFIED.into(), self.port)
    }

    /// Returns `slow_time_limit_ms` converted to a [`Duration`], or `None` if it is not set.
    pub fn slow_time_limit(&self) -> Option<Duration> {
        self.slow_time_limit_ms.map(Duration::from_millis)
    }

    /// Returns `hard_time_limit_ms` converted to a [`Duration`], or `None` if it is not set.
    pub fn hard_time_limit(&self) -> Option<Duration> {
        self.hard_time_limit_ms.map(Duration::from_millis)
    }
}

#[derive(Debug, Deserialize, Clone, PartialEq)]
Expand Down
6 changes: 5 additions & 1 deletion core/lib/config/src/testonly.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,11 @@ impl RandomConfig for configs::api::Web3JsonRpcConfig {

impl RandomConfig for configs::api::HealthCheckConfig {
fn sample(g: &mut Gen<impl Rng>) -> Self {
Self { port: g.gen() }
Self {
port: g.gen(),
slow_time_limit_ms: g.gen(),
hard_time_limit_ms: g.gen(),
}
}
}

Expand Down
8 changes: 7 additions & 1 deletion core/lib/env_config/src/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,11 @@ mod tests {
pushgateway_url: "http://127.0.0.1:9091".into(),
push_interval_ms: Some(100),
},
healthcheck: HealthCheckConfig { port: 8081 },
healthcheck: HealthCheckConfig {
port: 8081,
slow_time_limit_ms: Some(250),
hard_time_limit_ms: Some(2_000),
},
merkle_tree: MerkleTreeApiConfig { port: 8082 },
}
}
Expand Down Expand Up @@ -138,6 +142,8 @@ mod tests {
API_PROMETHEUS_PUSHGATEWAY_URL="http://127.0.0.1:9091"
API_PROMETHEUS_PUSH_INTERVAL_MS=100
API_HEALTHCHECK_PORT=8081
API_HEALTHCHECK_SLOW_TIME_LIMIT_MS=250
API_HEALTHCHECK_HARD_TIME_LIMIT_MS=2000
API_MERKLE_TREE_PORT=8082
"#;
lock.set_env(config);
Expand Down
2 changes: 2 additions & 0 deletions core/lib/health_check/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ keywords = ["blockchain", "zksync"]
categories = ["cryptography"]

[dependencies]
vise = { git = "https://github.com/matter-labs/vise.git", version = "0.1.0", rev = "1c9cc500e92cf9ea052b230e114a6f9cce4fb2c1" }

async-trait = "0.1"
futures = "0.3"
serde = { version = "1.0", features = ["derive"] }
Expand Down
122 changes: 122 additions & 0 deletions core/lib/health_check/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# Health Monitoring

Healthcheck infrastructure for node components allowing components to signal their current health state. Health states
for all components run by the node are aggregated and are exposed as an HTTP `GET /health` endpoint bound to a dedicated
healthcheck port, both for the main node and external node. This endpoint can be used as a readiness probe for
Kubernetes, or used in other automations.

## Main concepts

**Component** is a logically isolated part of a node that affects the ability of the node to handle requests (aka node
health). Components are supposed to run indefinitely until the node receives a stop signal.

- Internal components correspond to one or more Tokio tasks. Examples of internal components are: JSON-RPC API server,
Merkle tree, consistency checker, reorg detector.
- External components correspond to another process that the node communicates with. Examples of external components
are: Postgres connection pool, main node JSON-RPC (for the external node).

Each component can report its health, which consists of 2 parts:

- **Status**, e.g., "not ready", "ready", "shut down", "panicked"; see the crate code for a full list.
- **Details**, a JSON value with the component-specific schema. E.g., Merkle tree reports its L1 batch "cursor" as a
part of this information.

Health from all components is aggregated into **application health**, which has its own status computed as the worst of
component statuses. Application health is returned by the `/health` endpoint.

## `/health` endpoint format

`/health` will return current application health encoded as a JSON object. The HTTP status of the response is 20x if the
application is healthy, and 50x if it is not.

> **Warning.** The schema of data returned by the `/health` endpoint is not stable at this point and can change without
> notice. Use at your own risk.
<details>
<summary>Example of endpoint output for an external node:</summary>

```json
{
"status": "ready",
"components": {
"sync_state": {
"status": "ready",
"details": {
"is_synced": true,
"local_block": 91,
"main_node_block": 91
}
},
"connection_pool": {
"status": "ready",
"details": {
"max_size": 50,
"pool_size": 10
}
},
"tree": {
"status": "ready",
"details": {
"leaf_count": 12624,
"mode": "full",
"next_l1_batch_number": 26,
"root_hash": "0x54d537798f9ebd1b6463e3773c3549a389709987d559fdcd8d402a652a33fb68",
"stage": "main_loop"
}
},
"snapshot_recovery": {
"status": "ready",
"details": {
"factory_deps_recovered": true,
"snapshot_l1_batch": 24,
"snapshot_miniblock": 89,
"storage_logs_chunk_count": 10,
"storage_logs_chunks_left_to_process": 0,
"tokens_recovered": true
}
},
"consistency_checker": {
"status": "ready",
"details": {
"first_checked_batch": 25,
"last_checked_batch": 25
}
},
"ws_api": {
"status": "ready"
},
"prometheus_exporter": {
"status": "ready"
},
"reorg_detector": {
"status": "ready",
"details": {
"last_correct_l1_batch": 25,
"last_correct_miniblock": 91
}
},
"main_node_http_rpc": {
"status": "ready"
},
"batch_status_updater": {
"status": "ready",
"details": {
"last_committed_l1_batch": 25,
"last_executed_l1_batch": 25,
"last_proven_l1_batch": 25
}
},
"commitment_generator": {
"status": "ready",
"details": {
"l1_batch_number": 25
}
},
"http_api": {
"status": "ready"
}
}
}
```

</details>
Loading

0 comments on commit 1e34148

Please sign in to comment.