Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(node): add fstrim datadir observability #3322

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
24 changes: 17 additions & 7 deletions rs/ic_os/fstrim_tool/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,12 @@ fn write_metrics_using_tmp_file(metrics: &FsTrimMetrics, metrics_filename: &str)
.context("Failed to write metrics to file")
}

fn update_metrics(elapsed: Duration, is_success: bool, metrics_filename: &str) -> Result<()> {
fn update_metrics(
elapsed: Duration,
is_success: bool,
metrics_filename: &str,
is_datadir: bool,
) -> Result<()> {
let mut metrics = parse_existing_metrics_from_file(metrics_filename)
.unwrap_or_else(|e| {
eprintln!("error parsing existing metrics: {}", e);
Expand All @@ -71,7 +76,13 @@ fn update_metrics(elapsed: Duration, is_success: bool, metrics_filename: &str) -
eprintln!("no existing metrics found");
FsTrimMetrics::default()
});
metrics.update(is_success, elapsed)?;

if is_datadir {
metrics.update_datadir(is_success, elapsed)?;
} else {
metrics.update(is_success, elapsed)?;
}

write_metrics_using_tmp_file(&metrics, metrics_filename)
}

Expand Down Expand Up @@ -101,14 +112,13 @@ pub fn fstrim_tool(
let start = std::time::Instant::now();
let res_target = run_command(command, &target);
let elapsed_target = start.elapsed();
update_metrics(elapsed_target, res_target.is_ok(), &metrics_filename)?;
update_metrics(elapsed_target, res_target.is_ok(), &metrics_filename, false)?;

if !datadir_target.is_empty() && !is_node_assigned() {
// TODO observability changes needed, expand the metrics logic
// let start_datadir = std::time::Instant::now();
let start = std::time::Instant::now();
let res_datadir = run_command(command, &datadir_target);
// let elapsed_datadir = start_datadir.elapsed();
// update_metrics(elapsed_datadir, res_datadir.is_ok(), &metrics_filename)?;
let elapsed = start.elapsed();
update_metrics(elapsed, res_datadir.is_ok(), &metrics_filename, true)?;
res_target.and(res_datadir)
} else {
res_target
Expand Down
72 changes: 65 additions & 7 deletions rs/ic_os/fstrim_tool/src/metrics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,20 @@ const METRICS_LAST_RUN_DURATION_MILLISECONDS: &str = "fstrim_last_run_duration_m
const METRICS_LAST_RUN_SUCCESS: &str = "fstrim_last_run_success";
const METRICS_RUNS_TOTAL: &str = "fstrim_runs_total";

const METRICS_LAST_RUN_DURATION_MILLISECONDS_DATADIR: &str =
"fstrim_datadir_last_run_duration_milliseconds";
const METRICS_LAST_RUN_SUCCESS_DATADIR: &str = "fstrim_datadir_last_run_success";
const METRICS_RUNS_TOTAL_DATADIR: &str = "fstrim_datadir_runs_total";

#[derive(Debug)]
pub struct FsTrimMetrics {
pub last_duration_milliseconds: f64,
pub last_run_success: bool,
pub total_runs: f64,

pub last_duration_milliseconds_datadir: f64,
pub last_run_success_datadir: bool,
pub total_runs_datadir: f64,
}

impl Default for FsTrimMetrics {
Expand All @@ -21,6 +30,10 @@ impl Default for FsTrimMetrics {
last_duration_milliseconds: 0f64,
last_run_success: true,
total_runs: 0f64,

last_duration_milliseconds_datadir: 0f64,
last_run_success_datadir: true,
total_runs_datadir: 0f64,
}
}
}
Expand All @@ -33,6 +46,13 @@ impl FsTrimMetrics {
Ok(())
}

pub(crate) fn update_datadir(&mut self, success: bool, duration: Duration) -> Result<()> {
self.total_runs_datadir += 1f64;
self.last_run_success_datadir = success;
self.last_duration_milliseconds_datadir = duration.as_millis() as f64;
Ok(())
}

pub fn to_p8s_metrics_string(&self) -> String {
format!(
"# HELP fstrim_last_run_duration_milliseconds Duration of last run of fstrim in milliseconds\n\
Expand All @@ -43,16 +63,31 @@ impl FsTrimMetrics {
fstrim_last_run_success {}\n\
# HELP fstrim_runs_total Total number of runs of fstrim\n\
# TYPE fstrim_runs_total counter\n\
fstrim_runs_total {}\n",
fstrim_runs_total {}\n\
# HELP fstrim_datadir_last_run_duration_milliseconds Duration of last run of fstrim on datadir in milliseconds\n\
# TYPE fstrim_datadir_last_run_duration_milliseconds gauge\n\
fstrim_datadir_last_run_duration_milliseconds {}\n\
frankdavid marked this conversation as resolved.
Show resolved Hide resolved
# HELP fstrim_datadir_last_run_success Success status of last run of fstrim on datadir (success: 1, failure: 0)\n\
# TYPE fstrim_datadir_last_run_success gauge\n\
fstrim_datadir_last_run_success {}\n\
# HELP fstrim_datadir_runs_total Total number of runs of fstrim on datadir\n\
# TYPE fstrim_datadir_runs_total counter\n\
fstrim_datadir_runs_total {}\n",
to_go_f64(self.last_duration_milliseconds),
if self.last_run_success { "1" } else { "0" },
to_go_f64(self.total_runs),
).to_string()
to_go_f64(self.last_duration_milliseconds_datadir),
if self.last_run_success_datadir { "1" } else { "0" },
to_go_f64(self.total_runs_datadir),
)
.to_string()
andrewbattat marked this conversation as resolved.
Show resolved Hide resolved
}

fn are_valid(&self) -> bool {
is_f64_finite_and_0_or_larger(self.total_runs)
&& is_f64_finite_and_0_or_larger(self.last_duration_milliseconds)
&& is_f64_finite_and_0_or_larger(self.total_runs_datadir)
&& is_f64_finite_and_0_or_larger(self.last_duration_milliseconds_datadir)
}
}

Expand Down Expand Up @@ -102,27 +137,41 @@ where
let mut last_duration_milliseconds: Option<f64> = None;
let mut last_run_success: Option<bool> = None;
let mut total_runs: Option<f64> = None;

// Default datadir fields (we treat them as optional in the metrics file)
let mut datadir_last_duration_milliseconds: f64 = 0f64;
let mut datadir_last_run_success: bool = true;
let mut datadir_total_runs: f64 = 0f64;

for line_or_err in lines {
let line = line_or_err.map_err(|e| format_err!("failed to read line: {}", e))?;
match line.split(' ').collect::<Vec<_>>()[..] {
["#", ..] => continue,
[key, value] => match key {
METRICS_LAST_RUN_DURATION_MILLISECONDS => {
let _ = last_duration_milliseconds
.get_or_insert(parse_metrics_value(key, value)?);
last_duration_milliseconds.get_or_insert(parse_metrics_value(key, value)?);
}
METRICS_LAST_RUN_SUCCESS => {
let _ =
last_run_success.get_or_insert(parse_metrics_value(key, value)? > 0f64);
last_run_success.get_or_insert(parse_metrics_value(key, value)? > 0f64);
}
METRICS_RUNS_TOTAL => {
let _ = total_runs.get_or_insert(parse_metrics_value(key, value)?);
total_runs.get_or_insert(parse_metrics_value(key, value)?);
}
METRICS_LAST_RUN_DURATION_MILLISECONDS_DATADIR => {
datadir_last_duration_milliseconds = parse_metrics_value(key, value)?;
}
METRICS_LAST_RUN_SUCCESS_DATADIR => {
datadir_last_run_success = parse_metrics_value(key, value)? > 0f64;
}
METRICS_RUNS_TOTAL_DATADIR => {
datadir_total_runs = parse_metrics_value(key, value)?;
}
_ => return Err(format_err!("unknown metric key: {}", key)),
},
_ => return Err(format_err!("invalid metric line: {:?}", line)),
}
}

let metrics = FsTrimMetrics {
last_duration_milliseconds: last_duration_milliseconds.ok_or(format_err!(
"missing metric: {}",
Expand All @@ -131,6 +180,9 @@ where
last_run_success: last_run_success
.ok_or(format_err!("missing metric: {}", METRICS_LAST_RUN_SUCCESS))?,
total_runs: total_runs.ok_or(format_err!("missing metric: {}", METRICS_RUNS_TOTAL))?,
last_duration_milliseconds_datadir: datadir_last_duration_milliseconds,
last_run_success_datadir: datadir_last_run_success,
total_runs_datadir: datadir_total_runs,
};
if !metrics.are_valid() {
return Err(format_err!("parsed metrics are invalid"));
Expand All @@ -148,6 +200,12 @@ impl PartialEq for FsTrimMetrics {
other.last_duration_milliseconds,
)
&& (self.last_run_success == other.last_run_success)
&& f64_approx_eq(
self.last_duration_milliseconds_datadir,
other.last_duration_milliseconds_datadir,
)
&& (self.last_run_success_datadir == other.last_run_success_datadir)
&& f64_approx_eq(self.total_runs_datadir, other.total_runs_datadir)
}
}

Expand Down
43 changes: 39 additions & 4 deletions rs/ic_os/fstrim_tool/src/metrics/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ use assert_matches::assert_matches;
use ic_crypto_test_utils_reproducible_rng::reproducible_rng;
use rand::Rng;
use std::fs::write;
use std::time::Duration;
use tempfile::tempdir;

#[test]
fn should_compare_f64() {
Expand All @@ -17,7 +19,7 @@ fn should_compare_f64() {

#[test]
fn should_parse_valid_metrics_file() {
let temp_dir = tempfile::TempDir::new().expect("failed to create a temporary directory");
let temp_dir = tempdir().expect("failed to create a temporary directory");
let test_file = temp_dir.as_ref().join("test_file");
let metrics_file_content =
"# HELP fstrim_last_run_duration_milliseconds Duration of last run of fstrim in milliseconds\n\
Expand All @@ -30,18 +32,22 @@ fn should_parse_valid_metrics_file() {
# TYPE fstrim_runs_total counter\n\
fstrim_runs_total 1\n";
write(&test_file, metrics_file_content).expect("error writing to file");

let parsed_metrics = parse_existing_metrics_from_file(&test_file.to_string_lossy()).unwrap();
let expected_metrics = FsTrimMetrics {
last_duration_milliseconds: 6.0,
last_run_success: true,
total_runs: 1.0,
last_duration_milliseconds_datadir: 0.0,
last_run_success_datadir: true,
total_runs_datadir: 0.0,
};
assert_eq!(parsed_metrics, Some(expected_metrics));
}

#[test]
fn should_only_consider_first_parsed_value_when_parsing_metrics_file() {
let temp_dir = tempfile::TempDir::new().expect("failed to create a temporary directory");
let temp_dir = tempdir().expect("failed to create a temporary directory");
let test_file = temp_dir.as_ref().join("test_file");
let metrics_file_content =
"# HELP fstrim_last_run_duration_milliseconds Duration of last run of fstrim in milliseconds\n\
Expand All @@ -57,11 +63,15 @@ fn should_only_consider_first_parsed_value_when_parsing_metrics_file() {
fstrim_runs_total 12\n\
fstrim_runs_total 1\n";
write(&test_file, metrics_file_content).expect("error writing to file");

let parsed_metrics = parse_existing_metrics_from_file(&test_file.to_string_lossy()).unwrap();
let expected_metrics = FsTrimMetrics {
last_duration_milliseconds: 6.0,
last_run_success: true,
total_runs: 12.0,
last_duration_milliseconds_datadir: 0.0,
last_run_success_datadir: true,
total_runs_datadir: 0.0,
};
assert_eq!(parsed_metrics, Some(expected_metrics));
}
Expand Down Expand Up @@ -142,6 +152,7 @@ fn should_set_metrics() {
last_duration_milliseconds: 110.0,
last_run_success: true,
total_runs: 1.0,
..FsTrimMetrics::default()
};
assert_eq!(existing_metrics, expected_metrics);
}
Expand All @@ -162,7 +173,7 @@ fn should_update_metrics() {
for _ in 0..100 {
let success = rng.gen_bool(0.5);
let duration = Duration::from_millis(rng.gen_range(0..15000));
update_metrics(&mut expected_metrics, success, duration);
update_metrics_locally(&mut expected_metrics, success, duration);
updated_metrics
.update(success, duration)
.expect("should update metrics successfully");
Expand All @@ -177,7 +188,8 @@ fn should_update_metrics() {
}
}

fn update_metrics(metrics: &mut FsTrimMetrics, success: bool, duration: Duration) {
// Simple local "update" for the test reference
fn update_metrics_locally(metrics: &mut FsTrimMetrics, success: bool, duration: Duration) {
metrics.total_runs += 1f64;
metrics.last_run_success = success;
metrics.last_duration_milliseconds = duration.as_millis() as f64;
Expand All @@ -198,6 +210,7 @@ fn should_update_metric_with_infinite_values() {
last_duration_milliseconds: duration.as_millis() as f64,
last_run_success: success,
total_runs: f64::INFINITY,
..FsTrimMetrics::default()
};

assert_eq!(existing_metrics, expected_metrics);
Expand All @@ -218,6 +231,7 @@ fn should_update_metric_with_nan_values() {
last_duration_milliseconds: duration.as_millis() as f64,
last_run_success: success,
total_runs: f64::NAN,
..FsTrimMetrics::default()
};

assert_eq!(existing_metrics, expected_metrics);
Expand All @@ -242,3 +256,24 @@ fn should_maintain_invariants() {
verify_invariants(i as f64, &existing_metrics);
}
}

#[test]
fn should_update_datadir_metrics() {
let mut metrics = FsTrimMetrics::default();
assert_eq!(metrics.total_runs_datadir, 0.0);
assert_eq!(metrics.last_duration_milliseconds_datadir, 0.0);
assert!(metrics.last_run_success_datadir);

metrics
.update_datadir(false, Duration::from_millis(123))
.expect("should update datadir metrics");

assert_eq!(metrics.total_runs_datadir, 1.0);
assert_eq!(metrics.last_duration_milliseconds_datadir, 123.0);
assert!(!metrics.last_run_success_datadir);

// Check that normal fields remain untouched
assert_eq!(metrics.total_runs, 0.0);
assert_eq!(metrics.last_duration_milliseconds, 0.0);
assert!(metrics.last_run_success);
}
Loading
Loading