Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(node): add fstrim datadir observability #3322

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions rs/ic_os/fstrim_tool/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ DEV_DEPENDENCIES = [
"@crate_index//:assert_matches",
"@crate_index//:predicates",
"@crate_index//:rand",
"@crate_index//:regex",
"@crate_index//:tempfile",
]

Expand Down
1 change: 1 addition & 0 deletions rs/ic_os/fstrim_tool/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ assert_matches = { workspace = true }
ic-crypto-test-utils-reproducible-rng = { path = "../../crypto/test_utils/reproducible_rng" }
predicates = { workspace = true }
rand = { workspace = true }
regex = { workspace = true }
tempfile = { workspace = true }
24 changes: 17 additions & 7 deletions rs/ic_os/fstrim_tool/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,12 @@ fn write_metrics_using_tmp_file(metrics: &FsTrimMetrics, metrics_filename: &str)
.context("Failed to write metrics to file")
}

fn update_metrics(elapsed: Duration, is_success: bool, metrics_filename: &str) -> Result<()> {
fn update_metrics(
elapsed: Duration,
is_success: bool,
metrics_filename: &str,
is_datadir: bool,
) -> Result<()> {
let mut metrics = parse_existing_metrics_from_file(metrics_filename)
.unwrap_or_else(|e| {
eprintln!("error parsing existing metrics: {}", e);
Expand All @@ -71,7 +76,13 @@ fn update_metrics(elapsed: Duration, is_success: bool, metrics_filename: &str) -
eprintln!("no existing metrics found");
FsTrimMetrics::default()
});
metrics.update(is_success, elapsed)?;

if is_datadir {
metrics.update_datadir(is_success, elapsed)?;
} else {
metrics.update(is_success, elapsed)?;
}

write_metrics_using_tmp_file(&metrics, metrics_filename)
}

Expand Down Expand Up @@ -101,14 +112,13 @@ pub fn fstrim_tool(
let start = std::time::Instant::now();
let res_target = run_command(command, &target);
let elapsed_target = start.elapsed();
update_metrics(elapsed_target, res_target.is_ok(), &metrics_filename)?;
update_metrics(elapsed_target, res_target.is_ok(), &metrics_filename, false)?;

if !datadir_target.is_empty() && !is_node_assigned() {
// TODO observability changes needed, expand the metrics logic
// let start_datadir = std::time::Instant::now();
let start = std::time::Instant::now();
let res_datadir = run_command(command, &datadir_target);
// let elapsed_datadir = start_datadir.elapsed();
// update_metrics(elapsed_datadir, res_datadir.is_ok(), &metrics_filename)?;
let elapsed = start.elapsed();
update_metrics(elapsed, res_datadir.is_ok(), &metrics_filename, true)?;
res_target.and(res_datadir)
} else {
res_target
Expand Down
88 changes: 76 additions & 12 deletions rs/ic_os/fstrim_tool/src/metrics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,20 @@ const METRICS_LAST_RUN_DURATION_MILLISECONDS: &str = "fstrim_last_run_duration_m
const METRICS_LAST_RUN_SUCCESS: &str = "fstrim_last_run_success";
const METRICS_RUNS_TOTAL: &str = "fstrim_runs_total";

// Metric names for the fstrim run against the data directory. They mirror the
// three base metric names above, with a `fstrim_datadir_` prefix.
const METRICS_LAST_RUN_DURATION_MILLISECONDS_DATADIR: &str =
"fstrim_datadir_last_run_duration_milliseconds";
const METRICS_LAST_RUN_SUCCESS_DATADIR: &str = "fstrim_datadir_last_run_success";
const METRICS_RUNS_TOTAL_DATADIR: &str = "fstrim_datadir_runs_total";

/// Metrics describing fstrim runs, tracked separately for the primary target
/// and for the data directory (`*_datadir` fields).
///
/// Counters and durations are stored as `f64`, the same representation that is
/// written to the metrics output.
#[derive(Debug)]
pub struct FsTrimMetrics {
/// Duration of the last fstrim run, in milliseconds.
pub last_duration_milliseconds: f64,
/// Whether the last fstrim run succeeded.
pub last_run_success: bool,
/// Total number of fstrim runs recorded so far.
pub total_runs: f64,

/// Duration of the last fstrim run on the data directory, in milliseconds.
pub last_duration_milliseconds_datadir: f64,
/// Whether the last fstrim run on the data directory succeeded.
pub last_run_success_datadir: bool,
/// Total number of fstrim runs on the data directory recorded so far.
pub total_runs_datadir: f64,
}

impl Default for FsTrimMetrics {
Expand All @@ -21,6 +30,10 @@ impl Default for FsTrimMetrics {
last_duration_milliseconds: 0f64,
last_run_success: true,
total_runs: 0f64,

last_duration_milliseconds_datadir: 0f64,
last_run_success_datadir: true,
total_runs_datadir: 0f64,
}
}
}
Expand All @@ -33,26 +46,54 @@ impl FsTrimMetrics {
Ok(())
}

/// Records the outcome of one fstrim run against the data directory.
///
/// Stores `success` and the elapsed time (whole milliseconds) as the latest
/// datadir result and bumps the datadir run counter by one. Always returns
/// `Ok(())`.
pub(crate) fn update_datadir(&mut self, success: bool, duration: Duration) -> Result<()> {
    let elapsed_millis = duration.as_millis() as f64;

    self.last_duration_milliseconds_datadir = elapsed_millis;
    self.last_run_success_datadir = success;
    self.total_runs_datadir += 1f64;

    Ok(())
}

pub fn to_p8s_metrics_string(&self) -> String {
let fstrim_last_run_duration_milliseconds = to_go_f64(self.last_duration_milliseconds);
let fstrim_last_run_success = if self.last_run_success { "1" } else { "0" };
let fstrim_runs_total = to_go_f64(self.total_runs);

let fstrim_datadir_last_run_duration_milliseconds =
to_go_f64(self.last_duration_milliseconds_datadir);
let fstrim_datadir_last_run_success = if self.last_run_success_datadir {
"1"
} else {
"0"
};
let fstrim_datadir_runs_total = to_go_f64(self.total_runs_datadir);

format!(
"# HELP fstrim_last_run_duration_milliseconds Duration of last run of fstrim in milliseconds\n\
# TYPE fstrim_last_run_duration_milliseconds gauge\n\
fstrim_last_run_duration_milliseconds {}\n\
fstrim_last_run_duration_milliseconds {fstrim_last_run_duration_milliseconds}\n\
# HELP fstrim_last_run_success Success status of last run of fstrim (success: 1, failure: 0)\n\
# TYPE fstrim_last_run_success gauge\n\
fstrim_last_run_success {}\n\
fstrim_last_run_success {fstrim_last_run_success}\n\
# HELP fstrim_runs_total Total number of runs of fstrim\n\
# TYPE fstrim_runs_total counter\n\
fstrim_runs_total {}\n",
to_go_f64(self.last_duration_milliseconds),
if self.last_run_success { "1" } else { "0" },
to_go_f64(self.total_runs),
).to_string()
fstrim_runs_total {fstrim_runs_total}\n\
# HELP fstrim_datadir_last_run_duration_milliseconds Duration of last run of fstrim on datadir in milliseconds\n\
# TYPE fstrim_datadir_last_run_duration_milliseconds gauge\n\
fstrim_datadir_last_run_duration_milliseconds {fstrim_datadir_last_run_duration_milliseconds}\n\
# HELP fstrim_datadir_last_run_success Success status of last run of fstrim on datadir (success: 1, failure: 0)\n\
# TYPE fstrim_datadir_last_run_success gauge\n\
fstrim_datadir_last_run_success {fstrim_datadir_last_run_success}\n\
# HELP fstrim_datadir_runs_total Total number of runs of fstrim on datadir\n\
# TYPE fstrim_datadir_runs_total counter\n\
fstrim_datadir_runs_total {fstrim_datadir_runs_total}\n"
)
}

/// Returns `true` only if every numeric metric (run counters and durations,
/// for both the base and the datadir runs) passes
/// `is_f64_finite_and_0_or_larger`.
fn are_valid(&self) -> bool {
    let numeric_fields = [
        self.total_runs,
        self.last_duration_milliseconds,
        self.total_runs_datadir,
        self.last_duration_milliseconds_datadir,
    ];
    numeric_fields
        .iter()
        .all(|&value| is_f64_finite_and_0_or_larger(value))
}
}

Expand Down Expand Up @@ -102,27 +143,41 @@ where
let mut last_duration_milliseconds: Option<f64> = None;
let mut last_run_success: Option<bool> = None;
let mut total_runs: Option<f64> = None;

// Default datadir fields (we treat them as optional in the metrics file)
let mut datadir_last_duration_milliseconds: f64 = 0f64;
let mut datadir_last_run_success: bool = true;
let mut datadir_total_runs: f64 = 0f64;

for line_or_err in lines {
let line = line_or_err.map_err(|e| format_err!("failed to read line: {}", e))?;
match line.split(' ').collect::<Vec<_>>()[..] {
["#", ..] => continue,
[key, value] => match key {
METRICS_LAST_RUN_DURATION_MILLISECONDS => {
let _ = last_duration_milliseconds
.get_or_insert(parse_metrics_value(key, value)?);
last_duration_milliseconds.get_or_insert(parse_metrics_value(key, value)?);
}
METRICS_LAST_RUN_SUCCESS => {
let _ =
last_run_success.get_or_insert(parse_metrics_value(key, value)? > 0f64);
last_run_success.get_or_insert(parse_metrics_value(key, value)? > 0f64);
}
METRICS_RUNS_TOTAL => {
let _ = total_runs.get_or_insert(parse_metrics_value(key, value)?);
total_runs.get_or_insert(parse_metrics_value(key, value)?);
}
METRICS_LAST_RUN_DURATION_MILLISECONDS_DATADIR => {
datadir_last_duration_milliseconds = parse_metrics_value(key, value)?;
}
METRICS_LAST_RUN_SUCCESS_DATADIR => {
datadir_last_run_success = parse_metrics_value(key, value)? > 0f64;
}
METRICS_RUNS_TOTAL_DATADIR => {
datadir_total_runs = parse_metrics_value(key, value)?;
}
_ => return Err(format_err!("unknown metric key: {}", key)),
},
_ => return Err(format_err!("invalid metric line: {:?}", line)),
}
}

let metrics = FsTrimMetrics {
last_duration_milliseconds: last_duration_milliseconds.ok_or(format_err!(
"missing metric: {}",
Expand All @@ -131,6 +186,9 @@ where
last_run_success: last_run_success
.ok_or(format_err!("missing metric: {}", METRICS_LAST_RUN_SUCCESS))?,
total_runs: total_runs.ok_or(format_err!("missing metric: {}", METRICS_RUNS_TOTAL))?,
last_duration_milliseconds_datadir: datadir_last_duration_milliseconds,
last_run_success_datadir: datadir_last_run_success,
total_runs_datadir: datadir_total_runs,
};
if !metrics.are_valid() {
return Err(format_err!("parsed metrics are invalid"));
Expand All @@ -148,6 +206,12 @@ impl PartialEq for FsTrimMetrics {
other.last_duration_milliseconds,
)
&& (self.last_run_success == other.last_run_success)
&& f64_approx_eq(
self.last_duration_milliseconds_datadir,
other.last_duration_milliseconds_datadir,
)
&& (self.last_run_success_datadir == other.last_run_success_datadir)
&& f64_approx_eq(self.total_runs_datadir, other.total_runs_datadir)
}
}

Expand Down
Loading
Loading