Skip to content

Commit

Permalink
Introduce separate health_metrics crate
Browse files Browse the repository at this point in the history
* separate health_metrics crate

* remove metrics from warp_utils

* move ProcessHealth::observe and SystemHealth::observe to health_metrics

* fix errors

* nitpick `Cargo.toml`s

---------

Co-authored-by: Daniel Knopik <[email protected]>
  • Loading branch information
ThreeHrSleep and dknopik authored Jan 2, 2025
1 parent 099e2b6 commit 39b5dfd
Show file tree
Hide file tree
Showing 20 changed files with 171 additions and 132 deletions.
17 changes: 15 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ members = [
"common/eth2_interop_keypairs",
"common/eth2_network_config",
"common/eth2_wallet_manager",
"common/health_metrics",
"common/lighthouse_version",
"common/lockfile",
"common/logging",
Expand Down Expand Up @@ -247,6 +248,7 @@ filesystem = { path = "common/filesystem" }
fork_choice = { path = "consensus/fork_choice" }
genesis = { path = "beacon_node/genesis" }
gossipsub = { path = "beacon_node/lighthouse_network/gossipsub/" }
health_metrics = { path = "common/health_metrics" }
http_api = { path = "beacon_node/http_api" }
initialized_validators = { path = "validator_client/initialized_validators" }
int_to_bytes = { path = "consensus/int_to_bytes" }
Expand Down
1 change: 1 addition & 0 deletions beacon_node/http_api/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ ethereum_serde_utils = { workspace = true }
ethereum_ssz = { workspace = true }
execution_layer = { workspace = true }
futures = { workspace = true }
health_metrics = { workspace = true }
hex = { workspace = true }
lighthouse_network = { workspace = true }
lighthouse_version = { workspace = true }
Expand Down
1 change: 1 addition & 0 deletions beacon_node/http_api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ use eth2::types::{
ValidatorStatus, ValidatorsRequestBody,
};
use eth2::{CONSENSUS_VERSION_HEADER, CONTENT_TYPE_HEADER, SSZ_CONTENT_TYPE_HEADER};
use health_metrics::observe::Observe;
use lighthouse_network::{types::SyncState, EnrExt, NetworkGlobals, PeerId, PubsubMessage};
use lighthouse_version::version_with_platform;
use logging::SSELoggingComponents;
Expand Down
1 change: 1 addition & 0 deletions beacon_node/http_metrics/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ edition = { workspace = true }

[dependencies]
beacon_chain = { workspace = true }
health_metrics = { workspace = true }
lighthouse_network = { workspace = true }
lighthouse_version = { workspace = true }
malloc_utils = { workspace = true }
Expand Down
2 changes: 1 addition & 1 deletion beacon_node/http_metrics/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ pub fn gather_prometheus_metrics<T: BeaconChainTypes>(

lighthouse_network::scrape_discovery_metrics();

warp_utils::metrics::scrape_health_metrics();
health_metrics::metrics::scrape_health_metrics();

// It's important to ensure these metrics are explicitly enabled in the case that users aren't
// using glibc and this function causes panics.
Expand Down
6 changes: 1 addition & 5 deletions common/eth2/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,6 @@ zeroize = { workspace = true }
[dev-dependencies]
tokio = { workspace = true }

[target.'cfg(target_os = "linux")'.dependencies]
psutil = { version = "3.3.0", optional = true }
procfs = { version = "0.15.1", optional = true }

[features]
default = ["lighthouse"]
lighthouse = ["psutil", "procfs"]
lighthouse = []
122 changes: 0 additions & 122 deletions common/eth2/src/lighthouse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,6 @@ pub struct ValidatorInclusionData {
pub is_previous_epoch_head_attester: bool,
}

#[cfg(target_os = "linux")]
use {
psutil::cpu::os::linux::CpuTimesExt, psutil::memory::os::linux::VirtualMemoryExt,
psutil::process::Process,
};

/// Reports on the health of the Lighthouse instance.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Health {
Expand Down Expand Up @@ -164,69 +158,6 @@ pub struct SystemHealth {
pub misc_os: String,
}

impl SystemHealth {
#[cfg(not(target_os = "linux"))]
pub fn observe() -> Result<Self, String> {
Err("Health is only available on Linux".into())
}

#[cfg(target_os = "linux")]
pub fn observe() -> Result<Self, String> {
let vm = psutil::memory::virtual_memory()
.map_err(|e| format!("Unable to get virtual memory: {:?}", e))?;
let loadavg =
psutil::host::loadavg().map_err(|e| format!("Unable to get loadavg: {:?}", e))?;

let cpu =
psutil::cpu::cpu_times().map_err(|e| format!("Unable to get cpu times: {:?}", e))?;

let disk_usage = psutil::disk::disk_usage("/")
.map_err(|e| format!("Unable to disk usage info: {:?}", e))?;

let disk = psutil::disk::DiskIoCountersCollector::default()
.disk_io_counters()
.map_err(|e| format!("Unable to get disk counters: {:?}", e))?;

let net = psutil::network::NetIoCountersCollector::default()
.net_io_counters()
.map_err(|e| format!("Unable to get network io counters: {:?}", e))?;

let boot_time = psutil::host::boot_time()
.map_err(|e| format!("Unable to get system boot time: {:?}", e))?
.duration_since(std::time::UNIX_EPOCH)
.map_err(|e| format!("Boot time is lower than unix epoch: {}", e))?
.as_secs();

Ok(Self {
sys_virt_mem_total: vm.total(),
sys_virt_mem_available: vm.available(),
sys_virt_mem_used: vm.used(),
sys_virt_mem_free: vm.free(),
sys_virt_mem_cached: vm.cached(),
sys_virt_mem_buffers: vm.buffers(),
sys_virt_mem_percent: vm.percent(),
sys_loadavg_1: loadavg.one,
sys_loadavg_5: loadavg.five,
sys_loadavg_15: loadavg.fifteen,
cpu_cores: psutil::cpu::cpu_count_physical(),
cpu_threads: psutil::cpu::cpu_count(),
system_seconds_total: cpu.system().as_secs(),
cpu_time_total: cpu.total().as_secs(),
user_seconds_total: cpu.user().as_secs(),
iowait_seconds_total: cpu.iowait().as_secs(),
idle_seconds_total: cpu.idle().as_secs(),
disk_node_bytes_total: disk_usage.total(),
disk_node_bytes_free: disk_usage.free(),
disk_node_reads_total: disk.read_count(),
disk_node_writes_total: disk.write_count(),
network_node_bytes_total_received: net.bytes_recv(),
network_node_bytes_total_transmit: net.bytes_sent(),
misc_node_boot_ts_seconds: boot_time,
misc_os: std::env::consts::OS.to_string(),
})
}
}

/// Process specific health
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ProcessHealth {
Expand All @@ -244,59 +175,6 @@ pub struct ProcessHealth {
pub pid_process_seconds_total: u64,
}

impl ProcessHealth {
#[cfg(not(target_os = "linux"))]
pub fn observe() -> Result<Self, String> {
Err("Health is only available on Linux".into())
}

#[cfg(target_os = "linux")]
pub fn observe() -> Result<Self, String> {
let process =
Process::current().map_err(|e| format!("Unable to get current process: {:?}", e))?;

let process_mem = process
.memory_info()
.map_err(|e| format!("Unable to get process memory info: {:?}", e))?;

let me = procfs::process::Process::myself()
.map_err(|e| format!("Unable to get process: {:?}", e))?;
let stat = me
.stat()
.map_err(|e| format!("Unable to get stat: {:?}", e))?;

let process_times = process
.cpu_times()
.map_err(|e| format!("Unable to get process cpu times : {:?}", e))?;

Ok(Self {
pid: process.pid(),
pid_num_threads: stat.num_threads,
pid_mem_resident_set_size: process_mem.rss(),
pid_mem_virtual_memory_size: process_mem.vms(),
pid_mem_shared_memory_size: process_mem.shared(),
pid_process_seconds_total: process_times.busy().as_secs()
+ process_times.children_system().as_secs()
+ process_times.children_system().as_secs(),
})
}
}

impl Health {
#[cfg(not(target_os = "linux"))]
pub fn observe() -> Result<Self, String> {
Err("Health is only available on Linux".into())
}

#[cfg(target_os = "linux")]
pub fn observe() -> Result<Self, String> {
Ok(Self {
process: ProcessHealth::observe()?,
system: SystemHealth::observe()?,
})
}
}

/// Indicates how up-to-date the Eth1 caches are.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Eth1SyncStatusData {
Expand Down
12 changes: 12 additions & 0 deletions common/health_metrics/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[package]
name = "health_metrics"
version = "0.1.0"
edition = { workspace = true }

[dependencies]
eth2 = { workspace = true }
metrics = { workspace = true }

[target.'cfg(target_os = "linux")'.dependencies]
psutil = "3.3.0"
procfs = "0.15.1"
2 changes: 2 additions & 0 deletions common/health_metrics/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pub mod metrics;
pub mod observe;
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::observe::Observe;
use eth2::lighthouse::{ProcessHealth, SystemHealth};
use metrics::*;
use std::sync::LazyLock;
Expand Down
Loading

0 comments on commit 39b5dfd

Please sign in to comment.